Re: Add zstd support to libbacktrace

2022-12-16 Thread Ian Lance Taylor via Gcc-patches
Some more tweaks of the libbacktrace zstd decompressor to make
decompressing slightly faster: unpack all the literal data into the
output buffer, rather than using scratch space.  Bootstrapped and ran
libbacktrace tests on x86_64-pc-linux-gnu.  Committed to mainline.

Ian

* elf.c (elf_fetch_backward_init): New static function.
(ZSTD_TABLE_SIZE): Use huffman scratch space size rather than
literal size.
(ZSTD_TABLE_WORK_LIT_SIZE): Don't define.
(elf_zstd_read_huff): Use elf_fetch_backward_init.
(elf_zstd_read_literals): New static function.
(ZSTD_LIT_RAW, ZSTD_LIT_RLE, ZSTD_LIT_HUFF): Don't define.
(struct elf_zstd_literals): Don't define.
(elf_zstd_literal_output): Remove static function.
(elf_zstd_decompress): Use elf_fetch_backward_init and
elf_zstd_read_literals.  Rewrite literal copying.<
diff --git a/libbacktrace/elf.c b/libbacktrace/elf.c
index ece02db27f1..135a94245a4 100644
--- a/libbacktrace/elf.c
+++ b/libbacktrace/elf.c
@@ -1223,6 +1223,57 @@ elf_fetch_bits_backward (const unsigned char **ppin,
   return 1;
 }
 
+/* Initialize backward fetching when the bitstream starts with a 1 bit in the
+   last byte in memory (which is the first one that we read).  This is used by
+   zstd decompression.  Returns 1 on success, 0 on error.  */
+
+static int
+elf_fetch_backward_init (const unsigned char **ppin,
+const unsigned char *pinend,
+uint64_t *pval, unsigned int *pbits)
+{
+  const unsigned char *pin;
+  unsigned int stream_start;
+  uint64_t val;
+  unsigned int bits;
+
+  pin = *ppin;
+  stream_start = (unsigned int)*pin;
+  if (unlikely (stream_start == 0))
+{
+  elf_uncompress_failed ();
+  return 0;
+}
+  val = 0;
+  bits = 0;
+
+  /* Align to a 32-bit boundary.  */
+  while ((((uintptr_t)pin) & 3) != 0)
+{
+  val <<= 8;
+  val |= (uint64_t)*pin;
+  bits += 8;
+  --pin;
+}
+
+  val <<= 8;
+  val |= (uint64_t)*pin;
+  bits += 8;
+
+  *ppin = pin;
+  *pval = val;
+  *pbits = bits;
+  if (!elf_fetch_bits_backward (ppin, pinend, pval, pbits))
+return 0;
+
+  *pbits -= __builtin_clz (stream_start) - (sizeof (unsigned int) - 1) * 8 + 1;
+
+  if (!elf_fetch_bits_backward (ppin, pinend, pval, pbits))
+return 0;
+
+  return 1;
+}
+
 /* Huffman code tables, like the rest of the zlib format, are defined
by RFC 1951.  We store a Huffman code table as a series of tables
stored sequentially in memory.  Each entry in a table is 16 bits.
@@ -2617,14 +2668,13 @@ elf_zlib_inflate_and_verify (const unsigned char *pin, 
size_t sin,
- scratch space, one of
  - to build an FSE table: 512 uint16_t values == 1024 bytes
  - to build a Huffman tree: 512 uint16_t + 256 uint32_t == 2048 bytes
- - buffer for literal values == 2048 bytes
 */
 
 #define ZSTD_TABLE_SIZE\
   (2 * 512 * sizeof (struct elf_zstd_fse_baseline_entry)   \
+ 256 * sizeof (struct elf_zstd_fse_baseline_entry) \
+ 2048 * sizeof (uint16_t)  \
-   + 2048)
+   + 512 * sizeof (uint16_t) + 256 * sizeof (uint32_t))
 
 #define ZSTD_TABLE_LITERAL_FSE_OFFSET (0)
 
@@ -2642,8 +2692,6 @@ elf_zlib_inflate_and_verify (const unsigned char *pin, 
size_t sin,
 #define ZSTD_TABLE_WORK_OFFSET \
   (ZSTD_TABLE_HUFFMAN_OFFSET + 2048 * sizeof (uint16_t))
 
-#define ZSTD_TABLE_WORK_LIT_SIZE 2048
-
 /* An entry in a zstd FSE table.  */
 
 struct elf_zstd_fse_entry
@@ -3427,7 +3475,6 @@ elf_zstd_read_huff (const unsigned char **ppin, const 
unsigned char *pinend,
   uint16_t *scratch;
   const unsigned char *pfse;
   const unsigned char *pback;
-  unsigned char stream_start;
   uint64_t val;
   unsigned int bits;
   unsigned int state1, state2;
@@ -3454,31 +3501,8 @@ elf_zstd_read_huff (const unsigned char **ppin, const 
unsigned char *pinend,
 FSE_TABLE.  */
 
   pback = pin + hdr - 1;
-  stream_start = *pback;
-  if (unlikely (stream_start == 0))
-   {
- elf_uncompress_failed ();
- return 0;
-   }
-  val = 0;
-  bits = 0;
-  while ((((uintptr_t)pback) & 3) != 0)
-   {
- val <<= 8;
- val |= (uint64_t)*pback;
- bits += 8;
- --pback;
-   }
-  val <<= 8;
-  val |= (uint64_t)*pback;
-  bits += 8;
-
-  if (!elf_fetch_bits_backward (&pback, pfse, &val, &bits))
-   return 0;
-
-  bits -= __builtin_clz (stream_start) - 24 + 1;
 
-  if (!elf_fetch_bits_backward (&pback, pfse, &val, &bits))
+  if (!elf_fetch_backward_init (&pback, pfse, &val, &bits))
return 0;
 
   bits -= fse_table_bits;
@@ -3702,331 +3726,615 @@ elf_zstd_read_huff (const unsigned char **ppin, const 
unsigned char *pinend,
   return 1;
 }
 
-/* The information used to decompress a sequence code, which can be a literal
-   length, an offset, or a match length.  */
+/* Read and decompress the literals and store them ending at POUTEND.  This
+   works because we are going to use all the 

Re: Re: [PATCH] RISC-V: Fix RVV mask mode size

2022-12-16 Thread 钟居哲
>> Most likely than not you end up loading a larger quantity with the high
>> bits zero'd.  Interesting that we're using a packed model.  I'd been
>> told it was fairly expensive to implement in hardware relative to teh
>> cost of implementing the sparse model.

>> I'm a bit confused by this.  GCC can support single bit bools, though
>> ports often extend them to 8 bits or more for computational efficiency
>> purposes.  At least that's the case in general.  Is there something
>> particularly special about masks & bools that's causing problems?
I am not sure I am on the same page with you. I don't understand what is the
sparse model you said. The only thing I do in this patch is that we change the 
BYTESIZE VNx1BI for example
as the BYTESIZE of VNx1BI (Original I adjust all mask modes same size as 
VNx8QImode like LLVM). 
And I print the GET_MODE_SIZE (VNx1BI) the value is the same as VNx1QImode so I 
assume because GCC model 1-bool same as 1-QI???
Actually I not sure but I am sure after this patch, VNx1BI is adjusted smaller 
size.

Adjusting mask modes as smaller size always beneficial, since we can use vlm && 
vsm in register spilling, it can reduce the memory consuming and
load store hardware bandwidth.

Unlike LLVM, LLVM make each fractional vector and mask vector same size as LMUL 
=1 so they use vl1r/vs1r to do the register spilling which is not
optimal.


juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2022-12-17 09:53
To: 钟居哲; gcc-patches
CC: kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Fix RVV mask mode size
 
 
On 12/16/22 18:44, 钟居哲 wrote:
> Yes, VNx4DF only has 4 bit in mask mode in case of load and store.
> For example vlm or vsm we will load store 8-bit ??? (I am not sure 
> hardward can load store 4bit,but I am sure it definetly not load store 
> the whole register size)
Most likely than not you end up loading a larger quantity with the high 
bits zero'd.  Interesting that we're using a packed model.  I'd been 
told it was fairly expensive to implement in hardware relative to teh 
cost of implementing the sparse model.
 
> So ideally it should be model more accurate. However, since GCC assumes 
> that 1 BOOL is 1-byte, the only thing I do is to model mask mode as 
> smallest as possible.
> Maybe in the future, I can support 1BOOL for 1-bit?? I am not sure since 
> it will need to change GCC framework.
I'm a bit confused by this.  GCC can support single bit bools, though 
ports often extend them to 8 bits or more for computational efficiency 
purposes.  At least that's the case in general.  Is there something 
particularly special about masks & bools that's causing problems?
 
Jeff
 


Re: [PATCH] RISC-V: Fix RVV mask mode size

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/16/22 18:44, 钟居哲 wrote:

Yes, VNx4DF only has 4 bit in mask mode in case of load and store.
For example vlm or vsm we will load store 8-bit ??? (I am not sure 
hardward can load store 4bit,but I am sure it definetly not load store 
the whole register size)
Most likely than not you end up loading a larger quantity with the high 
bits zero'd.  Interesting that we're using a packed model.  I'd been 
told it was fairly expensive to implement in hardware relative to teh 
cost of implementing the sparse model.


So ideally it should be model more accurate. However, since GCC assumes 
that 1 BOOL is 1-byte, the only thing I do is to model mask mode as 
smallest as possible.
Maybe in the future, I can support 1BOOL for 1-bit?? I am not sure since 
it will need to change GCC framework.
I'm a bit confused by this.  GCC can support single bit bools, though 
ports often extend them to 8 bits or more for computational efficiency 
purposes.  At least that's the case in general.  Is there something 
particularly special about masks & bools that's causing problems?


Jeff


Re: Re: [PATCH] RISC-V: Fix RVV machine mode attribute configuration

2022-12-16 Thread 钟居哲
Actually, I don't check and test HF carefully since I disable them.
Kito ask me to disable all HF modes since zvfhmin is no ratified and GCC
doesn't allow any un-ratified ISA. You can see vector-iterator.md that all
RVV modes supported including QI HI SI DI SF DF excluding HF and BF.



juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2022-12-17 09:48
To: juzhe.zhong; gcc-patches
CC: kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Fix RVV machine mode attribute configuration
 
 
On 12/14/22 00:01, juzhe.zh...@rivai.ai wrote:
> From: Ju-Zhe Zhong 
> 
> The attribute configuration of each machine mode are support in the previous 
> patch.
> I noticed some of them are not correct during VSETVL PASS testsing.
> Correct them in the single patch now.
> 
> gcc/ChangeLog:
> 
>  * config/riscv/riscv-vector-switch.def (ENTRY): Correct attributes.
> 
 
 
 
> @@ -121,7 +121,7 @@ ENTRY (VNx2HI, true, LMUL_1, 16, LMUL_F2, 32)
>   ENTRY (VNx1HI, true, LMUL_F2, 32, LMUL_F4, 64)
>   
>   /* TODO:Disable all FP16 vector, enable them when 'zvfh' is supported.  */
> -ENTRY (VNx32HF, false, LMUL_8, 2, LMUL_RESERVED, 0)
> +ENTRY (VNx32HF, false, LMUL_RESERVED, 0, LMUL_8, 2)
Is there any value in making VNx32HF dependent on TARGET_MIN_VLEN > 32 
like we're doing for VNx32HI?   In the past I've found it useful to have 
HI, HF, BF behave identically as much as possible.
 
You call.  The patch is OK either way.
 
jeff
 
 
 
 


Re: [PATCH] RISC-V: Fix RVV machine mode attribute configuration

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/14/22 00:01, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

The attribute configuration of each machine mode are support in the previous 
patch.
I noticed some of them are not correct during VSETVL PASS testsing.
Correct them in the single patch now.

gcc/ChangeLog:

 * config/riscv/riscv-vector-switch.def (ENTRY): Correct attributes.






@@ -121,7 +121,7 @@ ENTRY (VNx2HI, true, LMUL_1, 16, LMUL_F2, 32)
  ENTRY (VNx1HI, true, LMUL_F2, 32, LMUL_F4, 64)
  
  /* TODO:Disable all FP16 vector, enable them when 'zvfh' is supported.  */

-ENTRY (VNx32HF, false, LMUL_8, 2, LMUL_RESERVED, 0)
+ENTRY (VNx32HF, false, LMUL_RESERVED, 0, LMUL_8, 2)
Is there any value in making VNx32HF dependent on TARGET_MIN_VLEN > 32 
like we're doing for VNx32HI?   In the past I've found it useful to have 
HI, HF, BF behave identically as much as possible.


You call.  The patch is OK either way.

jeff





Re: Re: [PATCH] RISC-V: Fix RVV mask mode size

2022-12-16 Thread 钟居哲
Yes, VNx4DF only has 4 bit in mask mode in case of load and store.
For example vlm or vsm we will load store 8-bit ??? (I am not sure hardward can 
load store 4bit,but I am sure it definetly not load store the whole register 
size)
So ideally it should be model more accurate. However, since GCC assumes that 1 
BOOL is 1-byte, the only thing I do is to model mask mode as smallest as 
possible.
Maybe in the future, I can support 1BOOL for 1-bit?? I am not sure since it 
will need to change GCC framework.



juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2022-12-17 04:22
To: juzhe.zhong; gcc-patches
CC: kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Fix RVV mask mode size
 
 
On 12/13/22 23:48, juzhe.zh...@rivai.ai wrote:
> From: Ju-Zhe Zhong 
> 
> This patch is to fix RVV mask modes size. Since mask mode size are adjust
> as a whole RVV register size LMUL = 1 which not only make each mask type for
> example vbool32_t tied to vint8m1_t but also increase memory consuming.
> 
> I notice this issue during development of VSETVL PASS. Since it is not part of
> VSETVL support, I seperate it into a single fix patch now.
> 
> gcc/ChangeLog:
> 
>  * config/riscv/riscv-modes.def (ADJUST_BYTESIZE): Reduce RVV mask 
> mode size.
>  * config/riscv/riscv.cc (riscv_v_adjust_bytesize): New function.
>  (riscv_modes_tieable_p): Don't tie mask modes which will create 
> issue.
>  * config/riscv/riscv.h (riscv_v_adjust_bytesize): New function.
So I haven't really studied the masking model for RVV (yet).  But 
there's two models that I'm generally aware of.
 
One model has a bit per element in the vector we're operating on.  So a 
V4DF will have 4 bits in the mask.  I generally call this the dense or 
packed model.
 
The other model has a bit for every element for the maximal number of 
elements that can ever appear in a vector.  So if we support an element 
length of 8bits and a 1kbit vector, then the sparse model would have 128 
bits regardless of the size of the object being operated on.  So we'd 
still have 128 bits for V4DF, but the vast majority would be don't cares.
 
 
ISTM that you're trying to set the mode size to the smallest possible 
which would seem to argue that you want the dense/packed mask model. 
Does that actually match what the hardware does?  If not, then don't we 
need to convert back and forth?
 
 
Or maybe I'm missing something here?!?
 
Jeff
 
 


Re: [PATCH] RISC-V: Add testcases for VSETVL PASS

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/16/22 18:31, 钟居哲 wrote:
Register allocation (RA) doesn't affect the assembler checks since I 
relax the registers in assmebler checks,

all assmebler checks have their own goal. For example:

The code like this:

+void foo2 (void * restrict in, void * restrict out, int n)
+{
+  for (int i = 0; i < n; i++)
+{
+  vuint16mf4_t v = *(vuint16mf4_t*)(in + i);
+  *(vuint16mf4_t*)(out + i) = v;
+}
+}

Assembler check:

scan-assembler-times 
{vsetvli\s+(?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7]),\s*zero,\s*e16,\s*mf4,\s*t[au],\s*m[au]\s+\.L[0-9]\:\s+vle16\.v\s+(?:v[0-9]|v[1-2][0-9]|v3[0-1]),0\s*\((?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7])\)}

I don't care about which vector register is using since I relax register in 
assembler : (?:v[0-9]|v[1-2][0-9]|v3[0-1]), this means any vector register 
v0-v31

But also I relax scalar register : (?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7]), 
so could be any x0 - x31 of them.

The only strict check is that make sure the vsetvl is hoist outside the loop 
meaning the location of vsetvl is outside of the Lable L[0-9]:

vsetvli\s+(?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7]),\s*zero,\s*e16,\s*mf4,\s*t[au],\s*m[au]\s+\.L[0-9]

You can see the last assembler is \s+\.L[0-9] to make sure VSETVL PASS 
successfully do the optimization that hoist the vsetvl instruction outside the 
loop

I try to use check-function-body but it fails since it can not recognize the 
Lable which is most important for such cases.
Ah, I should have looked at those regexps closer.  Understood about the 
checking for hoisting the vsetvl.  Though it makes me wonder if we'd be 
better off dumping information out of the vsetvl pass.


In the case of hoisting we could dump the loop nest of the original 
evaluation block and the loop nest of the new vsetvl location, then we 
scan for those in the vsetvl pass dump.  While it doesn't check the 
assembly code, it's probably just as good if not better.


Consider that as an alternative.  But I'm not going to insist on it.  I 
just know we've had a lot of trouble through the years where assembly 
code changes slightly, causing test fails.  So I try to avoid too much 
assembly scanning if it can be avoided.  Often the easiest way to get 
the same basic effect is to dump at the transformation point and scan 
for those markers in the dump file.


Jeff


Re: Re: [PATCH] RISC-V: Add testcases for VSETVL PASS

2022-12-16 Thread 钟居哲
Register allocation (RA) doesn't affect the assembler checks since I relax the 
registers in assmebler checks,
all assmebler checks have their own goal. For example:

The code like this:
+void foo2 (void * restrict in, void * restrict out, int n)
+{
+  for (int i = 0; i < n; i++)
+{
+  vuint16mf4_t v = *(vuint16mf4_t*)(in + i);
+  *(vuint16mf4_t*)(out + i) = v;
+}
+}
Assembler check:
scan-assembler-times 
{vsetvli\s+(?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7]),\s*zero,\s*e16,\s*mf4,\s*t[au],\s*m[au]\s+\.L[0-9]\:\s+vle16\.v\s+(?:v[0-9]|v[1-2][0-9]|v3[0-1]),0\s*\((?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7])\)}

I don't care about which vector register is using since I relax register in
assembler : (?:v[0-9]|v[1-2][0-9]|v3[0-1]), this means any vector register
v0-v31. But also I relax scalar register :
(?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7]), so could be any x0 - x31 of
them. The only strict check is that make sure the vsetvl is hoist outside the
loop meaning the location of vsetvl is outside of the Lable L[0-9]:
vsetvli\s+(?:ra|[sgtf]p|t[0-6]|s[0-9]|s10|s11|a[0-7]),\s*zero,\s*e16,\s*mf4,\s*t[au],\s*m[au]\s+\.L[0-9]
You can see the last assembler is \s+\.L[0-9] to make sure VSETVL PASS
successfully do the optimization that hoist the vsetvl instruction outside the
loop. I try to use check-function-body but it fails since it can not recognize
the Lable which is most important for such cases.


juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2022-12-17 04:07
To: juzhe.zhong; gcc-patches
CC: kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Add testcases for VSETVL PASS
 
 
On 12/14/22 01:09, juzhe.zh...@rivai.ai wrote:
> From: Ju-Zhe Zhong 
> 
> gcc/testsuite/ChangeLog:
> 
>  * gcc.target/riscv/rvv/rvv.exp: Adjust to enable tests for VSETVL 
> PASS.
>  * gcc.target/riscv/rvv/vsetvl/dump-1.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-1.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-10.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-11.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-12.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-13.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-14.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-15.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-16.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-17.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-18.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-19.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-2.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-3.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-4.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-5.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-6.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-7.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-8.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-9.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-6.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-7.c: New test.
>  * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-8.c: New test.
So it looks like the assembler strings you're searching for are highly 
specific (across all 5 testsuite patches).  How sensitive do we expect 
these tests to be to things like register allocation giving us different 
registers and such?  I'd hate to be in a position where we're constantly 
needing to update these tests because the output is changing in 
unimportant ways.
 
Jeff
 


Re: Re: [PATCH] RISC-V: Remove unit-stride store from ta attribute

2022-12-16 Thread 钟居哲
Yes, the vector stores doesn't care about policy no matter mask or tail.
Removing it can allow VSETVL PASS have more optimization chances
since VSETVL PASS has backward demands fusion.

For example:
vadd tama
vse.v
VSETVL PASS will choose to set tama for vse.v

vadd tumu
vse.v
VSETVL PASS will choose to set tumu for vse.v



juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2022-12-17 04:01
To: juzhe.zhong; gcc-patches
CC: kito.cheng; palmer
Subject: Re: [PATCH] RISC-V: Remove unit-stride store from ta attribute
 
 
On 12/14/22 04:36, juzhe.zh...@rivai.ai wrote:
> From: Ju-Zhe Zhong 
> 
> Since store instructions doesn't care about tail policy, we remove
> vste from "ta" attribute. Hence, we could have more fusion chances
> and better optimization.
> 
> gcc/ChangeLog:
> 
>  * config/riscv/vector.md: Remove vste.
Just to confirm that I understand the basic model.  Vector stores only 
update active elements, thus they don't care about tail policy, right?
 
Assuming that's the case, then this is OK.
 
jeff
 


Re: [PATCH RFA] build: add -Wconditionally-supported to strict_warn [PR64867]

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/6/22 06:26, Jason Merrill via Gcc-patches wrote:

Tested x86_64-pc-linux-gnu, OK for trunk?

-- 8< --

The PR (which isn't resolved by this commit) pointed out to me that GCC
should build with -Wconditionally-supported to support bootstrapping with a
C++11 compiler that makes different choices.

PR c++/64867

gcc/ChangeLog:

* configure.ac (strict_warn): Add -Wconditionally-supported.
* configure: Regenerate.

OK.  I wonder if it'll trip anything, particularly in the target files.

Jeff


Re: [PATCH] tree-optimization/106904 - bogus -Wstringop-overflow with vectors

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/7/22 06:54, Richard Biener via Gcc-patches wrote:

The following avoids CSE of &ps->wp to &ps->wp.hwnd confusing
-Wstringop-overflow by making sure to produce addresses to the
biggest container from vectorization.  For this I introduce
strip_zero_offset_components which turns &ps->wp.hwnd into
&(*ps) and use that to base the vector data references on.
That will also work for addresses with variable components,
alternatively emitting pointer arithmetic via calling
get_inner_reference and gimplifying that would be possible
but likely more intrusive.

This is by no means a complete fix for all of those issues
(avoiding ADDR_EXPRs in favor of pointer arithmetic might be).
Other passes will have similar issues.

In theory that might now cause false negatives.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Any opinion?

Thanks,
Richard.

PR tree-optimization/106904
* tree.h (strip_zero_offset_components): Declare.
* tree.cc (strip_zero_offset_components): Define.
* tree-vect-data-refs.cc (vect_create_addr_base_for_vector_ref):
Strip zero offset components before building the address.

* gcc.dg/Wstringop-overflow-pr106904.c: New testcase.
So you're just canonicalizing to the widest container for a zero offset 
access.  While it may not fix everything, that seems like a good thing 
in general when we can do so.  I wouldn't be surprised if other passes 
could do the same thing to fix some of the missed CSE opportunities.


jeff


Re: [PATCH] build: doc: Obsolete Solaris 11.3 support

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/13/22 05:35, Rainer Orth wrote:

This patch implements the Solaris 11.[0-3] obsoletion just announced in

https://gcc.gnu.org/pipermail/gcc/2022-December/240322.html

Bootstrapped without regressions on Solaris 11.3 (i386-pc-solaris2.11,
sparc-sun-solaris2.11 without and with --enable-obsolete) and 11.4.

Ok for trunk?

While I've been extra careful with the config.gcc part to make it work
correctly in native and cross configurations, it would be good if some
build maintainer could check.

The trouble is that config.guess doesn't include the minor version in
the triple and even if that were to change now, it's guaranteed to break
lots of code that doesn't expect this, so I'm doing the determination
locally.


OK.

jeff


Re: [PATCH] c++: empty captured var as template argument [PR107437]

2022-12-16 Thread Jason Merrill via Gcc-patches

On 12/16/22 14:03, Patrick Palka wrote:

On Fri, 16 Dec 2022, Jason Merrill wrote:


On 12/16/22 11:45, Patrick Palka wrote:

Here we're rejecting the use of the captured 't' (of empty type) as a
template argument ultimately because convert_nontype_argument checks
potentiality using is_constant_expression which returns false for
captured variables since want_rval=false.  But in this case an
lvalue-to-rvalue conversion of the argument is implied, so I believe we
should be checking is_rvalue_constant_expression.

This patch defines an rvalue version of is_nondep_const_expr and uses
it in convert_nontype_argument when the target type is a non-reference.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look right?
Alternatively, since 'non_dep' in convert_nontype_argument controls only
whether we should call instantiate_non_dependent_expr, it seems weird to
me that we care about potentiality at all.  So I experimented with using
instantiation_dependent_expression_p here instead, and that seemed to
work well and even fixed the dg-ice'd test cpp1z/constexpr-lambda26.C.
In r12-7564-gec0f53a3a542e7 we made a similar change to decltype.


This approach sounds good to me.


Like so?  Bootstrapped and regtested on x86_64-pc-linux-gnu.


OK.


-- >8 --

Subject: [PATCH] c++: constantness of non-dependent NTTP argument [PR107437]

Here we're rejecting the use of the captured 't' (of empty type) as a
template argument ultimately because convert_nontype_argument checks
constantness using is_constant_expression which returns false for
captured variables since want_rval=false.  But in this case an
lvalue-to-rvalue conversion of the argument is implied, so I believe we
should be using is_rvalue_constant_expression instead.

However, it doesn't seem necessary to consider constantness at all
in convert_nontype_argument when determining whether to instantiate
a non-dependent argument.  So this patch gets rid of the problematic
constantness test altogether, which is sufficient to fix the testcase
(as well as the similar dg-ice'd testcase from PR87765).  Note that in
r12-7564-gec0f53a3a542e7 we made a similar change to finish_decltype_type.

PR c++/107437
PR c++/87765

gcc/cp/ChangeLog:

* pt.cc (convert_nontype_argument): Relax is_nondep_const_expr
test to !inst_dep_expr_p.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1y/lambda-generic-107437.C: New test.
* g++.dg/cpp1z/constexpr-lambda26.C: Remove dg-ice.
---
  gcc/cp/pt.cc  |  2 +-
  .../g++.dg/cpp1y/lambda-generic-107437.C  | 21 +++
  .../g++.dg/cpp1z/constexpr-lambda26.C |  1 -
  3 files changed, 22 insertions(+), 2 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index bc566ab702b..2516cca590e 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -7318,7 +7318,7 @@ convert_nontype_argument (tree type, tree expr, 
tsubst_flags_t complain)
&& has_value_dependent_address (expr))
  /* If we want the address and it's value-dependent, don't fold.  */;
else if (processing_template_decl
-  && is_nondependent_constant_expression (expr))
+  && !instantiation_dependent_expression_p (expr))
  non_dep = true;
if (error_operand_p (expr))
  return error_mark_node;
diff --git a/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C 
b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
new file mode 100644
index 000..f9b4e0187e2
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
@@ -0,0 +1,21 @@
+// PR c++/107437
+// { dg-do compile { target c++14 } }
+
+struct integral_constant {
+  constexpr operator int() const { return 42; }
+};
+
+template
+struct A {
+  static constexpr int value = N;
+};
+
+template
+void f(T t) {
+  [=](auto) {
+A a; // { dg-bogus "constant" }
+return a.value;
+  }(0);
+}
+
+template void f(integral_constant);
diff --git a/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C 
b/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C
index 0cdb400d21c..e66cd1dee64 100644
--- a/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C
+++ b/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C
@@ -1,7 +1,6 @@
  // PR c++/87765
  // { dg-do compile { target c++17 } }
  // { dg-additional-options "-fchecking" }
-// { dg-ice "cxx_eval_constant_expression" }
  
  template 

  using foo = int;




Re: [PATCH] c-family: Fix ICE with -Wsuggest-attribute [PR98487]

2022-12-16 Thread Jason Merrill via Gcc-patches

On 12/16/22 13:28, Marek Polacek wrote:

Here we crash because check_function_format was using TREE_PURPOSE
directly rather than using get_attribute_name.

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?


OK.


PR c/98487

gcc/c-family/ChangeLog:

* c-format.cc (check_function_format): Use get_attribute_name.

gcc/testsuite/ChangeLog:

* c-c++-common/Wsuggest-attribute-1.c: New test.
---
  gcc/c-family/c-format.cc  |  2 +-
  .../c-c++-common/Wsuggest-attribute-1.c   | 36 +++
  2 files changed, 37 insertions(+), 1 deletion(-)
  create mode 100644 gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c

diff --git a/gcc/c-family/c-format.cc b/gcc/c-family/c-format.cc
index 01adea4ff41..08643c5da7a 100644
--- a/gcc/c-family/c-format.cc
+++ b/gcc/c-family/c-format.cc
@@ -1215,7 +1215,7 @@ check_function_format (const_tree fn, tree attrs, int 
nargs,
  for (c = TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl));
   c;
   c = TREE_CHAIN (c))
-   if (is_attribute_p ("format", TREE_PURPOSE (c))
+   if (is_attribute_p ("format", get_attribute_name (c))
&& (decode_format_type (IDENTIFIER_POINTER
(TREE_VALUE (TREE_VALUE (c
== info.format_type))
diff --git a/gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c 
b/gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c
new file mode 100644
index 000..8b5b398fb78
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c
@@ -0,0 +1,36 @@
+/* PR c/98487 */
+/* { dg-do compile { target { c || c++11 } } } */
+/* { dg-options "-Wsuggest-attribute=format" } */
+
+#include 
+
+[[gnu::__format__(__printf__, 1, 2)]]
+void
+do_printf(const char * const a0, ...)
+{
+  va_list ap;
+  va_start(ap, a0);
+  __builtin_vprintf(a0, ap);
+  va_end(ap);
+}
+
+[[gnu::__format__(__scanf__, 1, 2)]]
+void
+do_scanf(const char * const a0, ...)
+{
+  va_list ap;
+  va_start(ap, a0);
+  __builtin_vscanf(a0, ap);
+  va_end(ap);
+}
+
+struct tm;
+
+[[gnu::__format__(__strftime__, 1, 0)]]
+void
+do_strftime(const char * const a0, struct tm * a1)
+{
+  char buff[256];
+  __builtin_strftime(buff, sizeof(buff), a0, a1);
+  __builtin_puts(buff);
+}

base-commit: b50fe16a3b2214c418ecc5febc0bb21bc17912b7




Re: [PATCH] RISC-V: Remove unit-stride store from ta attribute

2022-12-16 Thread Andrew Waterman
On Fri, Dec 16, 2022 at 1:59 PM Palmer Dabbelt  wrote:
>
> On Fri, 16 Dec 2022 12:01:04 PST (-0800), jeffreya...@gmail.com wrote:
> >
> >
> > On 12/14/22 04:36, juzhe.zh...@rivai.ai wrote:
> >> From: Ju-Zhe Zhong 
> >>
> >> Since store instructions don't care about tail policy, we remove
> >> vste from "ta" attribute. Hence, we could have more fusion chances
> >> and better optimization.
> >>
> >> gcc/ChangeLog:
> >>
> >>  * config/riscv/vector.md: Remove vste.
> > Just to confirm that I understand the basic model.  Vector stores only
> > update active elements, thus they don't care about tail policy, right?
> >
> > Assuming that's the case, then this is OK.
>
> That had been my assumption as well, but I don't see that explicitly
> called out in the ISA manual.  I see
>
>Masked vector stores only update active memory elements.
>
> where "active" is defined as
>
> * The _body_ elements are those whose element index is greater than or 
> equal
> to the initial value in the `vstart` register, and less than the current
> vector length setting in `vl`. The body can be split into two disjoint 
> subsets:
>
> ** The _active_ elements during a vector instruction's execution are the
> elements within the body and where the current mask is enabled at that 
> element
> position.  The active elements can raise exceptions and update the 
> destination
> vector register group.
>
> but I don't see anything about the unmasked stores.  The blurb about
> tail elements only applies to registers groups, not memory addresses, so
> I think that's kind of a grey area there too.  I was pretty sure the intent
> here was to have tail elements not updated in memory, so hopefully I'm just
> missing something in the spec.

As discussed on the github issue, I think there is sufficient
justification in the spec to say that vector stores are forbidden from
accessing tail elements even if unmasked.  (And of course the ISA
would be pretty useless if that weren't the case...) But there's no
reason not to clarify the language in the spec, so as to make it
easier for readers to grok.

>
> I opened an issue to check: https://github.com/riscv/riscv-v-spec/issues/846


Re: [PATCH] RISC-V: Remove unit-stride store from ta attribute

2022-12-16 Thread Palmer Dabbelt

On Fri, 16 Dec 2022 12:01:04 PST (-0800), jeffreya...@gmail.com wrote:



On 12/14/22 04:36, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

Since store instructions don't care about tail policy, we remove
vste from "ta" attribute. Hence, we could have more fusion chances
and better optimization.

gcc/ChangeLog:

 * config/riscv/vector.md: Remove vste.

Just to confirm that I understand the basic model.  Vector stores only
update active elements, thus they don't care about tail policy, right?

Assuming that's the case, then this is OK.


That had been my assumption as well, but I don't see that explicitly 
called out in the ISA manual.  I see


  Masked vector stores only update active memory elements.

where "active" is defined as

   * The _body_ elements are those whose element index is greater than or equal
   to the initial value in the `vstart` register, and less than the current
   vector length setting in `vl`. The body can be split into two disjoint 
subsets:

   ** The _active_ elements during a vector instruction's execution are the
   elements within the body and where the current mask is enabled at that 
element
   position.  The active elements can raise exceptions and update the 
destination
   vector register group.

but I don't see anything about the unmasked stores.  The blurb about 
tail elements only applies to registers groups, not memory addresses, so 
I think that's kind of a grey area there too.  I was pretty sure the intent

here was to have tail elements not updated in memory, so hopefully I'm just
missing something in the spec.

I opened an issue to check: https://github.com/riscv/riscv-v-spec/issues/846


[committed] Suppress warning from -fstack-protector on hppa

2022-12-16 Thread John David Anglin
Committed to trunk.

Dave
---

Suppress -fstack-protector warning on hppa.

Some package builds enable -fstack-protector and -Werror. Since
-fstack-protector is not supported on hppa because the stack grows
up, these packages must check for the warning generated by
-fstack-protector and suppress it on hppa. This is problematic
since hppa is the only significant architecture where the stack
grows up.

2022-12-16  John David Anglin  

gcc/ChangeLog:

* config/pa/pa.cc (pa_option_override): Disable -fstack-protector.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp (check_effective_target_static): Return 0
on hppa*-*-*.

diff --git a/gcc/config/pa/pa.cc b/gcc/config/pa/pa.cc
index 54ab486a02d..9f43802075f 100644
--- a/gcc/config/pa/pa.cc
+++ b/gcc/config/pa/pa.cc
@@ -567,6 +567,9 @@ pa_option_override (void)
   flag_reorder_blocks = 1;
 }
 
+  /* Disable -fstack-protector to suppress warning.  */
+  flag_stack_protect = 0;
+
   /* We can't guarantee that .dword is available for 32-bit targets.  */
   if (UNITS_PER_WORD == 4)
 targetm.asm_out.aligned_op.di = NULL;
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 2a058c67c53..0ed20bf9e45 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1214,6 +1214,9 @@ proc check_effective_target_static {} {
 
 # Return 1 if the target supports -fstack-protector
 proc check_effective_target_fstack_protector {} {
+if { [istarget hppa*-*-*] } {
+   return 0;
+}
 return [check_runtime fstack_protector {
#include 
int main (int argc, char *argv[]) {



signature.asc
Description: PGP signature


Re: [PATCH 2/3] Make __float128 use the _Float128 type, PR target/107299

2022-12-16 Thread Michael Meissner via Gcc-patches
On Fri, Dec 16, 2022 at 11:55:27AM -0600, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Dec 15, 2022 at 07:09:38PM -0500, Michael Meissner wrote:
> > On Thu, Dec 15, 2022 at 11:59:49AM -0600, Segher Boessenkool wrote:
> > > On Wed, Dec 14, 2022 at 10:36:03AM +0100, Jakub Jelinek wrote:
> > > > The hacks with different precisions of powerpc 128-bit floating types 
> > > > are
> > > > very unfortunate, it is I assume because the middle-end asserted that 
> > > > scalar
> > > > floating point types with different modes have different precision.
> > > 
> > > IEEE QP and double-double cannot be ordered, neither represents a subset
> > > of the other.  But the current middle end does require a total ordering
> > > for all floating point types (that can be converted to each other).
> > > 
> > > Mike's precision hack was supposed to give us some time until an actual
> > > fix was made.  But no one has worked on that, and there have been
> > > failures found with the precision hack as well, it worked remarkably
> > > well but it isn't perfect.
> > > 
> > > We cannot move forward in a meaningful way until these problems are
> > > fixed.  We can move around like headless chickens some more of course.
> > 
> > In general I tend to think most of these automatic widenings are
> > problematical.  But there are cases where it makes sense.
> 
> These things are *not* widening at all, that is the problem.  For some
> values it is lossy, in either direction.

Ummm yes and no.

Going from SFmode to DFmode is a widening, as is SDmode to DDmode.  Since all
values within SFmode or SDmode can be represented in DFmode/DDmode.  That is
needed, since not all machines have full support for arithmetic.

Going from DFmode to KFmode is still a widening, but in general we may want to
prevent it from happing due to the speed of KFmode operations compared to
DFmode.  Likewise going from DFmode to IFmode is a widening since all values in
DFmode can be represented in IFmode.

Obviously going from IFmode to KFmode or the reverse is where the issue is.

> > Let's see.  On the PowerPC, there is no support for 32-bit decimal 
> > arithmetic.
> > There you definitely want the compiler to automatically promote SDmode to DDmode 
> > to
> > do the arithmetic and then possibly convert it back.  Similarly for the 
> > limited
> > 16-bit floating point modes, where you have operations to pack and unpack 
> > the
> > object, but you have no arithmetic.
> 
> And those things *are* widening, non-lossy in all cases.  Well-defined
> etc.

Yes, but we just need to improve the hooks to prevent cases where it is not
defined.

> > But I would argue that you NEVER want to automatically promote DFmode to 
> > either
> > KFmode, TFmode, or IFmode, since those modes are (almost) always going to be
> > slower than doing the emulation.  This is particularly true if we only 
> > support
> > a subset of operations, where some things can be done inline, but a lot of
> > operations would need to be done via emulation (such as on power8 where we
> > don't have IEEE 128-bit support in hardware).
> 
> TFmode is either the same actual mode as either KFmode or IFmode, let's
> reduce confusion by not talking about it at all anymore.
> 
> The middle end should never convert to another mode without a good
> reason for it.  But OTOH, both IFmode and KFmode can represent all
> values in DFmode, you just have to be careful about semantics when
> eventually converting back.

It doesn't.  Where the issue is is when you call a built-in function and that
built-in function uses different types than what you pass.  Then conversions
have to be inserted.

> > If the machine independent part of the compiler decides oh we can do this
> > operation because some operations are present (such as move, negate, 
> > absolute
> > value, and compare), then you likely will wind up promoting the 64-bit 
> > type(s)
> > to 128-bit, doing a call to a slower 128-bit function, and then truncating 
> > the
> > value back to 64-bit is faster than calling a 64-bit emulation function.
> 
> This does not in general have the correct semantics though (without
> -ffast-math), so the compiler will not do things like it.

It would happen if we didn't set the hooks to prevent it, but we did.  Maybe
there are places that need more hooks.

> > While for the PowerPC, we want to control what is the logical wider type for
> > floating point types, I can imagine we don't want all backends to have to
> > implment these hooks if they just have the standard 2-3 floating point 
> > modes.
> 
> KFmode is not wider than IFmode.  IFmode is not wider than KFmode.
> KFmode can represent values IFmode cannot.  IFmode can represent values
> KFmode cannot.

There the issue is the historical issue that GCC believes there is only one
number (precision) that says whether one type is wider than another.  And to some
extent, precision is wrong, in that do you want precision to mean the number of
bytes in a value, the mantissa size, the exponent 

[PATCH] libsanitizer: Add __interceptor_sigsetjmp_internal

2022-12-16 Thread H.J. Lu via Gcc-patches
Add an internal alias to __interceptor_sigsetjmp to avoid R_X86_64_PC32
relocation for "jmp __interceptor_sigsetjmp" with old assemblers.

PR sanitizer/108106
* hwasan/hwasan_setjmp_x86_64.S (__interceptor_sigsetjmp): Add
an internal alias, __interceptor_sigsetjmp_internal.
---
 libsanitizer/hwasan/hwasan_setjmp_x86_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libsanitizer/hwasan/hwasan_setjmp_x86_64.S 
b/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
index 7566c1ea0a5..071dcdcf613 100644
--- a/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
+++ b/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
@@ -37,13 +37,14 @@ __interceptor_setjmp:
   CFI_STARTPROC
   _CET_ENDBR
   xorl %esi, %esi
-  jmp  __interceptor_sigsetjmp
+  jmp  __interceptor_sigsetjmp_internal
   CFI_ENDPROC
 ASM_SIZE(__interceptor_setjmp)
 
 .global __interceptor_sigsetjmp
 ASM_TYPE_FUNCTION(__interceptor_sigsetjmp)
 __interceptor_sigsetjmp:
+__interceptor_sigsetjmp_internal:
   CFI_STARTPROC
   _CET_ENDBR
 
-- 
2.38.1



Re: Add '-Wno-complain-wrong-lang', and use it in 'gcc/testsuite/lib/target-supports.exp:check_compile' and elsewhere (was: Make '-frust-incomplete-and-experimental-compiler-do-not-use' a 'Common' opt

2022-12-16 Thread Iain Buclaw via Gcc-patches
Excerpts from Thomas Schwinge's message of Dezember 16, 2022 3:10 pm:
> 
> In the test suites, a number of existing test cases explicitly match the
> "command-line option [...] is valid for [...] but not for [...]"
> diagnostic with 'dg-warning'; I've left those alone.  On the other hand,
> I've changed 'dg-prune-output' of this diagnostic into
> '-Wno-complain-wrong-lang' usage.  I'm happy to adjust that in either way
> anyone may prefer.  I've not looked for test cases that just to silence
> this diagnostic use more general 'dg-prune-output', 'dg-excess-errors',
> '-w', etc.
> 
> In the GCC/D test suite, I see a number of:
> 
> cc1plus: warning: command-line option '-fpreview=in' is valid for D but 
> not for C++
> 
> cc1plus: warning: command-line option '-fextern-std=c++11' is valid for D 
> but not for C++
> 
> It's not clear to me how they, despite this, do achieve
> 'PASS: [...] (test for excess errors)'?  Maybe I haven't found where that
> gets pruned/ignored?
> 

There's an implicit dg-prune-output inserted by the gdc-convert-test
proc. As the only tests that mix C++ and D sources is the runnable_cxx
part of the testsuite, I can add the flag to the D2 testsuite scripts
later just for those tests.

Iain.


Re: [PATCH V4 1/2] rs6000: use li;x?oris to build constant

2022-12-16 Thread Segher Boessenkool
Hi!

On Mon, Dec 12, 2022 at 09:38:28AM +0800, Jiufu Guo wrote:
>   PR target/106708
> 
> gcc/ChangeLog:
> 
>   * config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Add using
>   "li; x?oris" to build constant.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/powerpc/pr106708.c: New test.

Okay for trunk with the nits Ke Wen pointed out taken care off.

Thanks!


Segher


Re: [PATCH V4 1/2] rs6000: use li;x?oris to build constant

2022-12-16 Thread Segher Boessenkool
On Wed, Dec 14, 2022 at 06:27:57PM +0800, Kewen.Lin wrote:
> > +void __attribute__ ((__noipa__)) lixoris (long long *arg)
> 
> Nit: Adding separator "_" to make the name like "li_xoris" or even
> "test_li_xoris" seems better to read.  Also applied for the other
> function names "lioris" and "lisrldicl".

Ha yes, that last one is a bit impregnable like this.  It is testsuite
so everything goes of course, but :-)


Segher


[committed] libstdc++: Add monadic operations to std::expected for C++23 (P2505R5)

2022-12-16 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

This was approved for C++23 last month in Kona.

libstdc++-v3/ChangeLog:

* include/std/expected (expected): Add monadic operations.
(expected): Likewise.
* include/std/version (__cpp_lib_expected): Bump value.
* testsuite/20_util/expected/synopsis.cc: Adjust expected macro
value.
* testsuite/20_util/expected/version.cc: Likewise.
* testsuite/20_util/expected/illformed_neg.cc: Prune additional
errors from ill-formed monadic operations.
* testsuite/20_util/expected/observers.cc: Check error_or.
* testsuite/20_util/expected/monadic.cc: New test.
---
 libstdc++-v3/include/std/expected | 579 +-
 libstdc++-v3/include/std/version  |   2 +-
 .../20_util/expected/illformed_neg.cc |   1 +
 .../testsuite/20_util/expected/monadic.cc | 280 +
 .../testsuite/20_util/expected/observers.cc   |  20 +
 .../testsuite/20_util/expected/synopsis.cc|   2 +-
 .../testsuite/20_util/expected/version.cc |   2 +-
 7 files changed, 882 insertions(+), 4 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/20_util/expected/monadic.cc

diff --git a/libstdc++-v3/include/std/expected 
b/libstdc++-v3/include/std/expected
index 2fe25a90d2d..555779581f2 100644
--- a/libstdc++-v3/include/std/expected
+++ b/libstdc++-v3/include/std/expected
@@ -35,6 +35,7 @@
 
 #include 
 #include // exception
+#include// __invoke
 #include // construct_at
 #include   // in_place_t
 
@@ -49,7 +50,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
* @{
*/
 
-#define __cpp_lib_expected 202202L
+#define __cpp_lib_expected 202211L
 
   /// Discriminated union that holds an expected value or an error value.
   /**
@@ -151,11 +152,20 @@ namespace __expected
   template
 constexpr bool __is_unexpected> = true;
 
+  template
+using __result = remove_cvref_t>;
+  template
+using __result0 = remove_cvref_t>;
+
   template
 concept __can_be_unexpected
   = is_object_v<_Er> && (!is_array_v<_Er>)
  && (!__expected::__is_unexpected<_Er>)
  && (!is_const_v<_Er>) && (!is_volatile_v<_Er>);
+
+  // Tag types for in-place construction from an invocation result.
+  struct __in_place_inv { };
+  struct __unexpect_inv { };
 }
 /// @endcond
 
@@ -334,6 +344,14 @@ namespace __expected
   __not_>
  >;
 
+  template
+   static constexpr bool __same_val
+ = is_same_v;
+
+  template
+   static constexpr bool __same_err
+ = is_same_v;
+
 public:
   using value_type = _Tp;
   using error_type = _Er;
@@ -791,6 +809,274 @@ namespace __expected
  return static_cast<_Tp>(std::forward<_Up>(__v));
}
 
+  template
+   constexpr _Er
+   error_or(_Gr&& __e) const&
+   {
+ static_assert( is_copy_constructible_v<_Er> );
+ static_assert( is_convertible_v<_Gr, _Er> );
+
+ if (_M_has_value)
+   return std::forward<_Gr>(__e);
+ return _M_unex;
+   }
+
+  template
+   constexpr _Er
+   error_or(_Gr&& __e) &&
+   {
+ static_assert( is_move_constructible_v<_Er> );
+ static_assert( is_convertible_v<_Gr, _Er> );
+
+ if (_M_has_value)
+   return std::forward<_Gr>(__e);
+ return std::move(_M_unex);
+   }
+
+  // monadic operations
+
+  template requires is_copy_constructible_v<_Er>
+   constexpr auto
+   and_then(_Fn&& __f) &
+   {
+ using _Up = __expected::__result<_Fn, _Tp&>;
+ static_assert(__expected::__is_expected<_Up>);
+ static_assert(is_same_v);
+
+ if (has_value())
+   return std::__invoke(std::forward<_Fn>(__f), value());
+ else
+   return _Up(unexpect, error());
+   }
+
+  template requires is_copy_constructible_v<_Er>
+   constexpr auto
+   and_then(_Fn&& __f) const &
+   {
+ using _Up = __expected::__result<_Fn, const _Tp&>;
+ static_assert(__expected::__is_expected<_Up>);
+ static_assert(is_same_v);
+
+ if (has_value())
+   return std::__invoke(std::forward<_Fn>(__f), value());
+ else
+   return _Up(unexpect, error());
+   }
+
+  template requires is_move_constructible_v<_Er>
+   constexpr auto
+   and_then(_Fn&& __f) &&
+   {
+ using _Up = __expected::__result<_Fn, _Tp&&>;
+ static_assert(__expected::__is_expected<_Up>);
+ static_assert(is_same_v);
+
+ if (has_value())
+   return std::__invoke(std::forward<_Fn>(__f), std::move(value()));
+ else
+   return _Up(unexpect, std::move(error()));
+   }
+
+
+  template requires is_move_constructible_v<_Er>
+   constexpr auto
+   and_then(_Fn&& __f) const &&
+   {
+ using _Up = __expected::__result<_Fn, const _Tp&&>;
+ 

[committed] libstdc++: Fixes for std::expected

2022-12-16 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk. I'll backport to gcc-12 too.

-- >8 --

This fixes some bugs in the swap functions for std::expected.

It also disables the noexcept-specifiers for equality operators, because
those are problematic when querying whether a std::expected is equality
comparable. The operator==(const expected&, const U&) function is
not constrained, so is viable for comparing expected with
expected, but then we get an error from the noexcept-specifier.

libstdc++-v3/ChangeLog:

* include/std/expected (expected::_M_swap_val_unex): Guard the
correct object.
(expected::swap): Move is_swappable
requirement from static_assert to constraint.
(swap): Likewise.
(operator==): Remove noexcept-specifier.
* testsuite/20_util/expected/swap.cc: Check swapping of
types without non-throwing move constructor. Check constraints
on swap.
* testsuite/20_util/expected/unexpected.cc: Check constraints on
swap.
* testsuite/20_util/expected/equality.cc: New test.
---
 libstdc++-v3/include/std/expected | 21 ++---
 .../testsuite/20_util/expected/equality.cc| 49 ++
 .../testsuite/20_util/expected/swap.cc| 92 ++-
 .../testsuite/20_util/expected/unexpected.cc  |  4 +
 4 files changed, 152 insertions(+), 14 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/20_util/expected/equality.cc

diff --git a/libstdc++-v3/include/std/expected 
b/libstdc++-v3/include/std/expected
index e491ce41591..2fe25a90d2d 100644
--- a/libstdc++-v3/include/std/expected
+++ b/libstdc++-v3/include/std/expected
@@ -217,8 +217,8 @@ namespace __expected
 
   constexpr void
   swap(unexpected& __other) noexcept(is_nothrow_swappable_v<_Er>)
+  requires is_swappable_v<_Er>
   {
-   static_assert( is_swappable_v<_Er> );
using std::swap;
swap(_M_unex, __other._M_unex);
   }
@@ -230,9 +230,8 @@ namespace __expected
{ return __x._M_unex == __y.error(); }
 
   friend constexpr void
-  swap(unexpected& __x, unexpected& __y)
-  noexcept(noexcept(__x.swap(__y)))
-  requires requires {__x.swap(__y);}
+  swap(unexpected& __x, unexpected& __y) noexcept(noexcept(__x.swap(__y)))
+  requires is_swappable_v<_Er>
   { __x.swap(__y); }
 
 private:
@@ -798,8 +797,8 @@ namespace __expected
requires (!is_void_v<_Up>)
friend constexpr bool
operator==(const expected& __x, const expected<_Up, _Er2>& __y)
-   noexcept(noexcept(bool(*__x == *__y))
- && noexcept(bool(__x.error() == __y.error(
+   // FIXME: noexcept(noexcept(bool(*__x == *__y))
+ // && noexcept(bool(__x.error() == __y.error(
{
  if (__x.has_value())
return __y.has_value() && bool(*__x == *__y);
@@ -810,13 +809,13 @@ namespace __expected
   template
friend constexpr bool
operator==(const expected& __x, const _Up& __v)
-   noexcept(noexcept(bool(*__x == __v)))
+   // FIXME: noexcept(noexcept(bool(*__x == __v)))
{ return __x.has_value() && bool(*__x == __v); }
 
   template
friend constexpr bool
operator==(const expected& __x, const unexpected<_Er2>& __e)
-   noexcept(noexcept(bool(__x.error() == __e.error(
+   // FIXME: noexcept(noexcept(bool(__x.error() == __e.error(
{ return !__x.has_value() && bool(__x.error() == __e.error()); }
 
   friend constexpr void
@@ -878,7 +877,7 @@ namespace __expected
  }
else
  {
-   __expected::_Guard<_Tp> __guard(__rhs._M_val);
+   __expected::_Guard<_Tp> __guard(_M_val);
std::construct_at(__builtin_addressof(_M_unex),
  std::move(__rhs._M_unex)); // might throw
_M_has_value = false;
@@ -1187,7 +1186,7 @@ namespace __expected
requires is_void_v<_Up>
friend constexpr bool
operator==(const expected& __x, const expected<_Up, _Er2>& __y)
-   noexcept(noexcept(bool(__x.error() == __y.error(
+   // FIXME: noexcept(noexcept(bool(__x.error() == __y.error(
{
  if (__x.has_value())
return __y.has_value();
@@ -1198,7 +1197,7 @@ namespace __expected
   template
friend constexpr bool
operator==(const expected& __x, const unexpected<_Er2>& __e)
-   noexcept(noexcept(bool(__x.error() == __e.error(
+   // FIXME: noexcept(noexcept(bool(__x.error() == __e.error(
{ return !__x.has_value() && bool(__x.error() == __e.error()); }
 
   friend constexpr void
diff --git a/libstdc++-v3/testsuite/20_util/expected/equality.cc 
b/libstdc++-v3/testsuite/20_util/expected/equality.cc
new file mode 100644
index 000..1862719e73d
--- /dev/null
+++ b/libstdc++-v3/testsuite/20_util/expected/equality.cc
@@ -0,0 +1,49 @@
+// { dg-options "-std=gnu++23" }
+// { dg-do compile { target c++23 } }
+

[committed] libstdc++: Diagnose broken allocator rebind members

2022-12-16 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk.

-- >8 --

This adds a static assertion to std::allocator_traits::rebind_alloc to
diagnose violations of the rule that rebinding an allocator to its own
value type yields the same allocator type.

This helps to catch the easy mistake of deriving from std::allocator but
forgetting to override the rebind behaviour (no longer an issue in C++20
as std::allocator doesn't have a rebind member that can be inherited).
It also catches bugs like in 23_containers/vector/52591.cc where a typo
means the rebound allocator is a completely different type.

I initially wanted to put this static assert into the body of
allocator_traits:

  static_assert(is_same, _Alloc>::value,
"rebind_alloc must be Alloc");

However, this causes a regression in the test for PR libstdc++/72792.
It seems that instantiating std::allocator_traits should be allowed for
invalid allocator types as long as you don't try to rebind them. To
support that, only assert in the __allocator_traits_base::__rebind class
template (in both the primary template and the partial specialization).
As a result, the bug in 20_util/scoped_allocator/outermost.cc is not
diagnosed, because nothing in that test rebinds the allocator.

libstdc++-v3/ChangeLog:

* include/bits/alloc_traits.h (__allocator_traits_base::__rebind):
Add static assert for rebind requirement.
* testsuite/20_util/allocator_traits/members/rebind_alloc.cc:
Fix invalid rebind member in test allocator.
* testsuite/20_util/allocator_traits/requirements/rebind_neg.cc:
New test.
* testsuite/20_util/scoped_allocator/outermost.cc: Add rebind to
test allocator.
* testsuite/23_containers/forward_list/48101_neg.cc: Prune new
static assert error.
* testsuite/23_containers/unordered_multiset/48101_neg.cc:
Likewise.
* testsuite/23_containers/unordered_set/48101_neg.cc:
Likewise.
* testsuite/23_containers/vector/52591.cc: Fix typo in rebind.
---
 libstdc++-v3/include/bits/alloc_traits.h  | 17 ++--
 .../allocator_traits/members/rebind_alloc.cc  | 11 +-
 .../requirements/rebind_neg.cc| 20 +++
 .../20_util/scoped_allocator/outermost.cc |  8 
 .../23_containers/forward_list/48101_neg.cc   |  1 +
 .../unordered_multiset/48101_neg.cc   |  1 +
 .../23_containers/unordered_set/48101_neg.cc  |  1 +
 .../testsuite/23_containers/vector/52591.cc   |  2 +-
 8 files changed, 52 insertions(+), 9 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/20_util/allocator_traits/requirements/rebind_neg.cc

diff --git a/libstdc++-v3/include/bits/alloc_traits.h 
b/libstdc++-v3/include/bits/alloc_traits.h
index 203988ab933..6eae409ab53 100644
--- a/libstdc++-v3/include/bits/alloc_traits.h
+++ b/libstdc++-v3/include/bits/alloc_traits.h
@@ -51,12 +51,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   struct __allocator_traits_base
   {
 template
-  struct __rebind : __replace_first_arg<_Tp, _Up> { };
+  struct __rebind : __replace_first_arg<_Tp, _Up>
+  {
+   static_assert(is_same<
+ typename __replace_first_arg<_Tp, typename _Tp::value_type>::type,
+   _Tp>::value,
+ "allocator_traits::rebind_alloc must be A");
+  };
 
 template
   struct __rebind<_Tp, _Up,
  __void_t::other>>
-  { using type = typename _Tp::template rebind<_Up>::other; };
+  {
+   using type = typename _Tp::template rebind<_Up>::other;
+
+   static_assert(is_same<
+ typename _Tp::template rebind::other,
+   _Tp>::value,
+ "allocator_traits::rebind_alloc must be A");
+  };
 
   protected:
 template
diff --git 
a/libstdc++-v3/testsuite/20_util/allocator_traits/members/rebind_alloc.cc 
b/libstdc++-v3/testsuite/20_util/allocator_traits/members/rebind_alloc.cc
index ca2a8044665..dc2b1afa2d5 100644
--- a/libstdc++-v3/testsuite/20_util/allocator_traits/members/rebind_alloc.cc
+++ b/libstdc++-v3/testsuite/20_util/allocator_traits/members/rebind_alloc.cc
@@ -24,17 +24,16 @@ using std::is_same;
 template
   using Rebind = typename std::allocator_traits::template rebind_alloc;
 
-#if __STDC_HOSTED__
-template
+template
   struct HasRebind {
 using value_type = T;
-template struct rebind { using other = std::allocator; };
+template struct rebind { using other = HasRebind; };
   };
 
-static_assert(is_same, long>,
- std::allocator>::value,
+// Would get HasRebind here if the first template argument is
+// replaced instead of using the nested rebind.
+static_assert(is_same, long>, HasRebind>::value,
  "nested alias template is used");
-#endif
 
 template
   struct NoRebind0 {
diff --git 
a/libstdc++-v3/testsuite/20_util/allocator_traits/requirements/rebind_neg.cc 
b/libstdc++-v3/testsuite/20_util/allocator_traits/requirements/rebind_neg.cc
new file 

Re: [PATCH V6] rs6000: Optimize cmp on rotated 16bits constant

2022-12-16 Thread Segher Boessenkool
On Wed, Dec 14, 2022 at 04:26:54PM +0800, Jiufu Guo wrote:
> Segher Boessenkool  writes:
> > On Mon, Aug 29, 2022 at 11:42:16AM +0800, Jiufu Guo wrote:
> >> li %r9,-1
> >> rldicr %r9,%r9,0,0
> >> cmpd %cr0,%r3,%r9
> >
> > FWIW, I find the winnt assembler syntax very hard to read, and I doubt
> > I am the only one.
> Oh, sorry about that.  I will avoid to add '-mregnames' to dump asm. :)
> BTW, what options are you used to dump asm code? 

The same as GCC outputs, and as I write assembler code as: bare numbers.
It is much easier to type, and very much easier to read.

-mregnames is fine for output (and it is the default as well), but
problematic for input.  Take for example
li r10,r10
which translates to
li 10,10
while what was probably wanted is to load the address of the global
symbol r10, which can be written as
li r10,(r10)

I do understand that liking the bare numbers syntax is an acquired taste
of course.  But less clutter is very useful.  This goes hand in hand
with writing multiple asm statements per line, which allows you to group
things together nicely:
li 9,-1 ; rldicr 9,9,0,0 ; cmpd 3,9

> > Maybe it is better to not return magic values anyway?  So perhaps
> >
> > bool
> > can_be_done_as_compare_of_rotate (unsigned HOST_WIDE_INT c, int clz, int 
> > *rot)
> >
> > (with *rot written if the return value is true).
> Thanks for your suggestion!
> It is checking if a constant can be rotated from/to a value which has
> only a few trailing nonzero bits (all leading bits are zeros). 
> 
> So, I'm thinking to name the function as something like:
> can_be_rotated_to_lowbits.

That is a good name yeah.

> >> +bool
> >> +compare_rotate_immediate_p (unsigned HOST_WIDE_INT c)
> >
> > No _p please, this function is not a predicate (at least, the name does
> > not say what it tests).  So a better name please.  This matters even
> > more for extern functions (like this one) because the function
> > implementation is always farther away so you do not easily have all
> > interface details in mind.  Good names help :-)
> Thanks! Name is always a matter. :)
> 
> Maybe we can name this funciton as "can_be_rotated_as_compare_operand",
> or "is_constant_rotateable_for_compare", because this function checks
> "if a constant can be rotated to/from an immediate operand of
> cmpdi/cmpldi". 

Maybe just "constant_can_be_rotated_to_lowbits"?  (If that is what the
function does).  It doesn't clearly say that it allows negative numbers
as well, but that is a problem of the function itself already; maybe it
would be better to do signed and unsigned separately.

> >> +(define_code_iterator eqne [eq ne])
> >> +(define_code_attr EQNE [(eq "EQ") (ne "NE")])
> >
> > Just  or  should work?
> Great! Thanks for pointing this out!  works.
> >
> > Please fix these things.  Almost there :-)
> 
> I updated the patch as below. Bootstraping and regtesting is ongoing.
> Thanks again for your careful and insight review!

Please send as new message (not as reply even), that is much easier to
handle.  Thanks!


Segher


Re: [PATCH] RISC-V: Fix RVV mask mode size

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/13/22 23:48, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

This patch fixes the RVV mask mode size. Previously the mask mode size was
adjusted to a whole RVV register (LMUL = 1), which not only makes each mask type
(for example vbool32_t) tied to vint8m1_t but also increases memory consumption.

I noticed this issue during development of the VSETVL PASS. Since it is not part of
the VSETVL support, I separated it into a single fix patch now.

gcc/ChangeLog:

 * config/riscv/riscv-modes.def (ADJUST_BYTESIZE): Reduce RVV mask mode 
size.
 * config/riscv/riscv.cc (riscv_v_adjust_bytesize): New function.
 (riscv_modes_tieable_p): Don't tie mask modes which will create issue.
 * config/riscv/riscv.h (riscv_v_adjust_bytesize): New function.
So I haven't really studied the masking model for RVV (yet).  But 
there's two models that I'm generally aware of.


One model has a bit per element in the vector we're operating on.  So a 
V4DF will have 4 bits in the mask.  I generally call this the dense or 
packed model.


The other model has a bit for every element for the maximal number of 
elements that can ever appear in a vector.  So if we support an element 
length of 8bits and a 1kbit vector, then the sparse model would have 128 
bits regardless of the size of the object being operated on.  So we'd 
still have 128 bits for V4DF, but the vast majority would be don't cares.



ISTM that you're trying to set the mode size to the smallest possible 
which would seem to argue that you want the dense/packed mask model. 
Does that actually match what the hardware does?  If not, then don't we 
need to convert back and forth?



Or maybe I'm missing something here?!?

Jeff



Re: [PATCH] RISC-V: Add testcases for VSETVL PASS

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/14/22 01:09, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

gcc/testsuite/ChangeLog:

 * gcc.target/riscv/rvv/rvv.exp: Adjust to enable tests for VSETVL PASS.
 * gcc.target/riscv/rvv/vsetvl/dump-1.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-1.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-10.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-11.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-12.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-13.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-14.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-15.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-16.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-17.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-18.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-19.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-2.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-3.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-4.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-5.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-6.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-7.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-8.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_block-9.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-6.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-7.c: New test.
 * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-8.c: New test.
So it looks like the assembler strings you're searching for are highly 
specific (across all 5 testsuite patches).  How sensitive do we expect 
these tests to be to things like register allocation giving us different 
registers and such?  I'd hate to be in a position where we're constantly 
needing to update these tests because the output is changing in 
unimportant ways.


Jeff


Re: [PATCH] RISC-V: Remove unit-stride store from ta attribute

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/14/22 04:36, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

Since store instructions don't care about tail policy, we remove
vste from "ta" attribute. Hence, we could have more fusion chances
and better optimization.

gcc/ChangeLog:

 * config/riscv/vector.md: Remove vste.
Just to confirm that I understand the basic model.  Vector stores only 
update active elements, thus they don't care about tail policy, right?


Assuming that's the case, then this is OK.

jeff


[committed] analyzer: add src_region param to region_model::check_for_poison [PR106479]

2022-12-16 Thread David Malcolm via Gcc-patches
PR analyzer/106479 notes that we don't always show the region-creation
event for a memmove from an uninitialized stack region.  This occurs
when using kf_memcpy_memmove.  Fix by passing a src_region hint to
region_model::check_for_poison.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-4750-g2fdc8546b5c6cb.

gcc/analyzer/ChangeLog:
PR analyzer/106479
* kf.cc (kf_memcpy_memmove::impl_call_pre): Pass in source region
to region_model::check_for_poison.
* region-model-asm.cc (region_model::on_asm_stmt): Pass NULL
region to region_model::check_for_poison.
* region-model.cc (region_model::check_for_poison): Add
"src_region" param, and pass it to poisoned_value_diagnostic.
(region_model::on_assignment): Pass NULL region to
region_model::check_for_poison.
(region_model::get_rvalue): Likewise.
* region-model.h (region_model::check_for_poison): Add
"src_region" param.
* sm-fd.cc (fd_state_machine::on_accept): Pass in source region
to region_model::check_for_poison.
* varargs.cc (kf_va_copy::impl_call_pre): Pass NULL region to
region_model::check_for_poison.
(kf_va_arg::impl_call_pre): Pass in source region to
region_model::check_for_poison.

gcc/testsuite/ChangeLog:
PR analyzer/106479
* gcc.dg/analyzer/pr104308.c (test_memmove_within_uninit): Remove
xfail on region creation event.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/kf.cc   |  2 +-
 gcc/analyzer/region-model-asm.cc |  2 +-
 gcc/analyzer/region-model.cc | 11 ++-
 gcc/analyzer/region-model.h  |  1 +
 gcc/analyzer/sm-fd.cc|  1 +
 gcc/analyzer/varargs.cc  |  3 ++-
 gcc/testsuite/gcc.dg/analyzer/pr104308.c |  2 +-
 7 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/gcc/analyzer/kf.cc b/gcc/analyzer/kf.cc
index ff2f1b1ef9c..6088bfc72c0 100644
--- a/gcc/analyzer/kf.cc
+++ b/gcc/analyzer/kf.cc
@@ -288,7 +288,7 @@ kf_memcpy_memmove::impl_call_pre (const call_details ) 
const
   const svalue *src_contents_sval
 = model->get_store_value (sized_src_reg, cd.get_ctxt ());
   model->check_for_poison (src_contents_sval, cd.get_arg_tree (1),
-  cd.get_ctxt ());
+  sized_src_reg, cd.get_ctxt ());
   model->set_value (sized_dest_reg, src_contents_sval, cd.get_ctxt ());
 }
 
diff --git a/gcc/analyzer/region-model-asm.cc b/gcc/analyzer/region-model-asm.cc
index 171b2496f58..ac32c6f06f7 100644
--- a/gcc/analyzer/region-model-asm.cc
+++ b/gcc/analyzer/region-model-asm.cc
@@ -226,7 +226,7 @@ region_model::on_asm_stmt (const gasm *stmt, 
region_model_context *ctxt)
 
   tree src_expr = input_tvec[i];
   const svalue *src_sval = get_rvalue (src_expr, ctxt);
-  check_for_poison (src_sval, src_expr, ctxt);
+  check_for_poison (src_sval, src_expr, NULL, ctxt);
   input_svals.quick_push (src_sval);
   reachable_regs.handle_sval (src_sval);
 
diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index f6cd34f4c22..55064400a08 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -1004,11 +1004,13 @@ due_to_ifn_deferred_init_p (const gassign *assign_stmt)
 
 /* Check for SVAL being poisoned, adding a warning to CTXT.
Return SVAL, or, if a warning is added, another value, to avoid
-   repeatedly complaining about the same poisoned value in followup code.  */
+   repeatedly complaining about the same poisoned value in followup code.
+   SRC_REGION is a hint about where SVAL came from, and can be NULL.  */
 
 const svalue *
 region_model::check_for_poison (const svalue *sval,
tree expr,
+   const region *src_region,
region_model_context *ctxt) const
 {
   if (!ctxt)
@@ -1046,8 +1048,7 @@ region_model::check_for_poison (const svalue *sval,
 the tree other than via the def stmts, using
 fixup_tree_for_diagnostic.  */
   tree diag_arg = fixup_tree_for_diagnostic (expr);
-  const region *src_region = NULL;
-  if (pkind == POISON_KIND_UNINIT)
+  if (src_region == NULL && pkind == POISON_KIND_UNINIT)
src_region = get_region_for_poisoned_expr (expr);
   if (ctxt->warn (make_unique (diag_arg,
  pkind,
@@ -1100,7 +1101,7 @@ region_model::on_assignment (const gassign *assign, 
region_model_context *ctxt)
   if (const svalue *sval = get_gassign_result (assign, ctxt))
 {
   tree expr = get_diagnostic_tree_for_gassign (assign);
-  check_for_poison (sval, expr, ctxt);
+  check_for_poison (sval, expr, NULL, ctxt);
   set_value (lhs_reg, sval, ctxt);
   return;
 }
@@ -2227,7 +2228,7 @@ region_model::get_rvalue (path_var pv, 
region_model_context 

Re: [PATCH] RISC-V: Remove unused redundant vector attributes

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/14/22 01:51, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

I found that I forgot to remove these redundant attributes.
Sorry about that.

gcc/ChangeLog:

 * config/riscv/vector.md (): Remove redundant attributes.

OK.
jeff



Re: [PATCH] RISC-V: Fix annotation

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/14/22 01:39, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

gcc/ChangeLog:

 * config/riscv/riscv-vsetvl.cc: Fix annotation.

Just roll this into the patch that adds riscv-vsetvl.cc.

jeff


Re: [PATCH] RISC-V: Change vlmul printing rule

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/13/22 23:57, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

This patch is preparing patch for the following patch (VSETVL PASS)
support since the current vlmul printing rule is not appropriate
information for VSETVL PASS. I split this fix in a single patch.

gcc/ChangeLog:

 * config/riscv/riscv-v.cc (emit_vlmax_vsetvl): Pass through VLMUL enum 
instead of machine mode.
 * config/riscv/riscv-vector-builtins-bases.cc: Ditto.
 * config/riscv/riscv.cc (riscv_print_operand): Print LMUL by enum 
vlmul instead of machine mode.
OK.  Though I suspect your ChangeLog will need to be wrapped to get past 
the commit hooks.


jeff


Re: [PATCH] hwasan: Add libhwasan_preinit.o

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/16/22 03:27, Jakub Jelinek via Gcc-patches wrote:

Hi!

I've noticed an inconsistency with the other sanitizers.
For -fsanitize={address,thread,leak} we link into binaries
lib*san_preinit.o such that the -lasan, -ltsan or -llsan libraries
are initialized as early as possible through .preinit_array.
The hwasan library has the same thing, but we strangely compiled
it into the library (where it apparently didn't do anything,
.preinit_array doesn't seem to be created for shared libraries),
rather than installing it like in the other 3 cases.

The following patch handles it for hwasan similarly to asan, tsan and lsan.

I don't have any hw with hwasan support, so I've just checked it
builds and installs as expected and that
gcc -fsanitize=hwaddress -o a a.c -mlam=u57
on trivial main results in .preinit_array section in the binary.

Ok for trunk?

2022-12-16  Jakub Jelinek  

* config/gnu-user.h (LIBHWASAN_EARLY_SPEC): Add libhwasan_preinit.o
to link spec if not -shared.

* hwasan/Makefile.am (nodist_toolexeclib_HEADERS): Set to
libhwasan_preinit.o.
(hwasan_files): Remove hwasan_preinit.cpp.
(libhwasan_preinit.o): Copy from hwasan_preinit.o.
* hwasan/Makefile.in: Regenerated.
No objections to the technical bits.  I trust your judgement WRT 
risk/reward for including in gcc-13.  So your call on whether to commit 
now or wait.


jeff


Re: Adding a new thread model to GCC

2022-12-16 Thread Eric Botcazou via Gcc-patches
> The libgcc parts look reasonable to me, but I can't approve them.
> Maybe Jonathan Yong can approve those parts as mingw-w64 target
> maintainer, or maybe a libgcc approver can do so.

OK.

> The libstdc++ parts are OK for trunk. IIUC they could go in
> separately, they just wouldn't be very much use without the libgcc
> changes.

Sure thing.

-- 
Eric Botcazou




Re: [PATCH] c++: empty captured var as template argument [PR107437]

2022-12-16 Thread Patrick Palka via Gcc-patches
On Fri, 16 Dec 2022, Jason Merrill wrote:

> On 12/16/22 11:45, Patrick Palka wrote:
> > Here we're rejecting the use of the captured 't' (of empty type) as a
> > template argument ultimately because convert_nontype_argument checks
> > potentiality using is_constant_expression which returns false for
> > captured variables since want_rval=false.  But in this case an
> > lvalue-to-rvalue conversion of the argument is implied, so I believe we
> > should be checking is_rvalue_constant_expression.
> > 
> > This patch defines an rvalue version of is_nondep_const_expr and uses
> > it in convert_nontype_argument when the target type is a non-reference.
> > 
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look right?
> > Alternatively, since 'non_dep' in convert_nontype_argument controls only
> > whether we should call instantiate_non_dependent_expr, it seems weird to
> > me that we care about potentiality at all.  So I experimented with using
> > instantiation_dependent_expression_p here instead, and that seemed to
> > work well and even fixed the dg-ice'd test cpp1z/constexpr-lambda26.C.
> > In r12-7564-gec0f53a3a542e7 we made a similar change to decltype.
> 
> This approach sounds good to me.

Like so?  Bootstrapped and regtested on x86_64-pc-linux-gnu.

-- >8 --

Subject: [PATCH] c++: constantness of non-dependent NTTP argument [PR107437]

Here we're rejecting the use of the captured 't' (of empty type) as a
template argument ultimately because convert_nontype_argument checks
constantness using is_constant_expression which returns false for
captured variables since want_rval=false.  But in this case an
lvalue-to-rvalue conversion of the argument is implied, so I believe we
should be using is_rvalue_constant_expression instead.

However, it doesn't seem necessary to consider constantness at all
in convert_nontype_argument when determining whether to instantiate
a non-dependent argument.  So this patch gets rid of the problematic
constantness test altogether, which is sufficient to fix the testcase
(as well as the similar dg-ice'd testcase from PR87765).  Note that in
r12-7564-gec0f53a3a542e7 we made a similar change to finish_decltype_type.

PR c++/107437
PR c++/87765

gcc/cp/ChangeLog:

* pt.cc (convert_nontype_argument): Relax is_nondep_const_expr
test to !inst_dep_expr_p.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1y/lambda-generic-107437.C: New test.
* g++.dg/cpp1z/constexpr-lambda26.C: Remove dg-ice.
---
 gcc/cp/pt.cc  |  2 +-
 .../g++.dg/cpp1y/lambda-generic-107437.C  | 21 +++
 .../g++.dg/cpp1z/constexpr-lambda26.C |  1 -
 3 files changed, 22 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index bc566ab702b..2516cca590e 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -7318,7 +7318,7 @@ convert_nontype_argument (tree type, tree expr, 
tsubst_flags_t complain)
   && has_value_dependent_address (expr))
 /* If we want the address and it's value-dependent, don't fold.  */;
   else if (processing_template_decl
-  && is_nondependent_constant_expression (expr))
+  && !instantiation_dependent_expression_p (expr))
 non_dep = true;
   if (error_operand_p (expr))
 return error_mark_node;
diff --git a/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C 
b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
new file mode 100644
index 000..f9b4e0187e2
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
@@ -0,0 +1,21 @@
+// PR c++/107437
+// { dg-do compile { target c++14 } }
+
+struct integral_constant {
+  constexpr operator int() const { return 42; }
+};
+
+template
+struct A {
+  static constexpr int value = N;
+};
+
+template
+void f(T t) {
+  [=](auto) {
+A a; // { dg-bogus "constant" }
+return a.value;
+  }(0);
+}
+
+template void f(integral_constant);
diff --git a/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C 
b/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C
index 0cdb400d21c..e66cd1dee64 100644
--- a/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C
+++ b/gcc/testsuite/g++.dg/cpp1z/constexpr-lambda26.C
@@ -1,7 +1,6 @@
 // PR c++/87765
 // { dg-do compile { target c++17 } }
 // { dg-additional-options "-fchecking" }
-// { dg-ice "cxx_eval_constant_expression" }
 
 template 
 using foo = int;
-- 
2.39.0.56.g57e2c6ebbe



[PATCH] c-family: Fix ICE with -Wsuggest-attribute [PR98487]

2022-12-16 Thread Marek Polacek via Gcc-patches
Here we crash because check_function_format was using TREE_PURPOSE
directly rather than using get_attribute_name.

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

PR c/98487

gcc/c-family/ChangeLog:

* c-format.cc (check_function_format): Use get_attribute_name.

gcc/testsuite/ChangeLog:

* c-c++-common/Wsuggest-attribute-1.c: New test.
---
 gcc/c-family/c-format.cc  |  2 +-
 .../c-c++-common/Wsuggest-attribute-1.c   | 36 +++
 2 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c

diff --git a/gcc/c-family/c-format.cc b/gcc/c-family/c-format.cc
index 01adea4ff41..08643c5da7a 100644
--- a/gcc/c-family/c-format.cc
+++ b/gcc/c-family/c-format.cc
@@ -1215,7 +1215,7 @@ check_function_format (const_tree fn, tree attrs, int 
nargs,
  for (c = TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl));
   c;
   c = TREE_CHAIN (c))
-   if (is_attribute_p ("format", TREE_PURPOSE (c))
+   if (is_attribute_p ("format", get_attribute_name (c))
&& (decode_format_type (IDENTIFIER_POINTER
(TREE_VALUE (TREE_VALUE (c
== info.format_type))
diff --git a/gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c 
b/gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c
new file mode 100644
index 000..8b5b398fb78
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/Wsuggest-attribute-1.c
@@ -0,0 +1,36 @@
+/* PR c/98487 */
+/* { dg-do compile { target { c || c++11 } } } */
+/* { dg-options "-Wsuggest-attribute=format" } */
+
+#include 
+
+[[gnu::__format__(__printf__, 1, 2)]]
+void
+do_printf(const char * const a0, ...)
+{
+  va_list ap;
+  va_start(ap, a0);
+  __builtin_vprintf(a0, ap);
+  va_end(ap);
+}
+
+[[gnu::__format__(__scanf__, 1, 2)]]
+void
+do_scanf(const char * const a0, ...)
+{
+  va_list ap;
+  va_start(ap, a0);
+  __builtin_vscanf(a0, ap);
+  va_end(ap);
+}
+
+struct tm;
+
+[[gnu::__format__(__strftime__, 1, 0)]]
+void
+do_strftime(const char * const a0, struct tm * a1)
+{
+  char buff[256];
+  __builtin_strftime(buff, sizeof(buff), a0, a1);
+  __builtin_puts(buff);
+}

base-commit: b50fe16a3b2214c418ecc5febc0bb21bc17912b7
-- 
2.38.1



Re: [PATCH] speed up end_fde_sort using radix sort

2022-12-16 Thread Jason Merrill via Gcc-patches

On 11/22/22 03:00, Thomas Neumann wrote:

When registering a dynamic unwinding frame the fde list is sorted.
Previously, we split the list into a sorted and an unsorted part,
sorted the later using heap sort, and merged both. That can be
quite slow due to the large number of (expensive) comparisons.

This patch replaces that logic with a radix sort instead. The
radix sort uses the same amount of memory as the old logic,
using the second list as auxiliary space, and it includes two
techniques to speed up sorting: First, it computes the pointer
addresses for blocks of values, reducing the decoding overhead.
And it recognizes when the data has reached a sorted state,
allowing for early termination. When running out of memory
we fall back to pure heap sort, as before.

For this test program

#include 
int main(int argc, char** argv) {
  return 0;
}

compiled with g++ -O -o hello -static hello.c we get with
perf stat -r 200 on a 5950X the following performance numbers:

old logic:

   0,20 msec task-clock
    930.834  cycles
  3.079.765  instructions
     0,00030478 +- 0,0237 seconds time elapsed

new logic:

   0,10 msec task-clock
    473.269  cycles
  1.239.077  instructions
     0,00021119 +- 0,0168 seconds time elapsed

libgcc/ChangeLog:
     * unwind-dw2-fde.c: Use radix sort instead of split+sort+merge.
---
  libgcc/unwind-dw2-fde.c | 234 +++-
  1 file changed, 134 insertions(+), 100 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..b81540c41a4 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, 
const fde *x, const fde *y)


  typedef int (*fde_compare_t) (struct object *, const fde *, const fde *);

+// The extractor functions compute the pointer values for a block of
+// fdes. The block processing hides the call overhead.

-/* This is a special mix of insertion sort and heap sort, optimized for
-   the data sets that actually occur. They look like
-   101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130.
-   I.e. a linearly increasing sequence (coming from functions in the text
-   section), with additionally a few unordered elements (coming from 
functions
-   in gnu_linkonce sections) whose values are higher than the values in 
the

-   surrounding linear sequence (but not necessarily higher than the values
-   at the end of the linear sequence!).
-   The worst-case total run time is O(N) + O(n log (n)), where N is the
-   total number of FDEs and n is the number of erratic ones.  */
+static void
+fde_unencoded_extract (struct object *ob __attribute__ ((unused)),
+   _Unwind_Ptr *target, const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr));
+}
+
+static void
+fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+ const fde **x, int count)
+{
+  _Unwind_Ptr base;
+
+  base = base_from_object (ob->s.b.encoding, ob);
+  for (int index = 0; index < count; ++index)
+    read_encoded_value_with_base (ob->s.b.encoding, base, 
x[index]->pc_begin,

+  target + index);
+}
+
+static void
+fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+    const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    {
+  int encoding = get_fde_encoding (x[index]);
+  read_encoded_value_with_base (encoding, base_from_object 
(encoding, ob),

+    x[index]->pc_begin, target + index);
+    }
+}
+
+typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const 
fde **,

+ int);
+
+// Data is is sorted using radix sort if possible, using an temporary
+// auxiliary data structure of the same size as the input. When running
+// out of memory to in-place heap sort.


s/to/do/ ?

OK with that change or other fix to that sentence.


  struct fde_accumulator
  {
    struct fde_vector *linear;
-  struct fde_vector *erratic;
+  struct fde_vector *aux;
  };

  static inline int
@@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t 
count)

    if ((accu->linear = malloc (size)))
  {
    accu->linear->count = 0;
-  if ((accu->erratic = malloc (size)))
-    accu->erratic->count = 0;
+  if ((accu->aux = malloc (size)))
+    accu->aux->count = 0;
    return 1;
  }
    else
@@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde 
*this_fde)

  accu->linear->array[accu->linear->count++] = this_fde;
  }

-/* Split LINEAR into a linear sequence with low values and an erratic
-   sequence with high values, put the linear one (of longest possible
-   length) into LINEAR and the erratic one into ERRATIC. This is O(N).
-
-   Because the longest linear sequence we 

Re: [PATCH 2/3] Make __float128 use the _Float128 type, PR target/107299

2022-12-16 Thread Segher Boessenkool
Hi!

On Thu, Dec 15, 2022 at 07:09:38PM -0500, Michael Meissner wrote:
> On Thu, Dec 15, 2022 at 11:59:49AM -0600, Segher Boessenkool wrote:
> > On Wed, Dec 14, 2022 at 10:36:03AM +0100, Jakub Jelinek wrote:
> > > The hacks with different precisions of powerpc 128-bit floating types are
> > > very unfortunate, it is I assume because the middle-end asserted that 
> > > scalar
> > > floating point types with different modes have different precision.
> > 
> > IEEE QP and double-double cannot be ordered, neither represents a subset
> > of the other.  But the current middle end does require a total ordering
> > for all floating point types (that can be converted to each other).
> > 
> > Mike's precision hack was supposed to give us some time until an actual
> > fix was made.  But no one has worked on that, and there have been
> > failures found with the precision hack as well, it worked remarkably
> > well but it isn't perfect.
> > 
> > We cannot move forward in a meaningful way until these problems are
> > fixed.  We can move around like headless chickens some more of course.
> 
> In general I tend to think most of these automatic widenings are
> problematical.  But there are cases where it makes sense.

These things are *not* widening at all, that is the problem.  For some
values it is lossy, in either direction.

> Lets see.  On the PowerPC, there is no support for 32-bit decimal arithmetic.
> There you definately the compiler to automatically promoto SDmode to DDmode to
> do the arithmetic and then possibly convert it back.  Similarly for the 
> limited
> 16-bit floating point modes, where you have operations to pack and unpack the
> object, but you have no arithmetic.

And those things *are* widening, non-lossy in all cases.  Well-defined
etc.

> But I would argue that you NEVER want to automatically promote DFmode to 
> either
> KFmode, TFmode, or IFmode, since those modes are (almost) always going to be
> slower than doing the emulation.  This is particularly true if we only support
> a subset of operations, where some things can be done inline, but a lot of
> operations would need to be done via emulation (such as on power8 where we
> don't have IEEE 128-bit support in hardware).

TFmode is either the same actual mode as either KFmode or IFmode, let's
reduce confusion by not talking about it at all anymore.

The middle end should never convert to another mode without a good
reason for it.  But OTOH, both IFmode and KFmode can represent all
values in DFmode, you just have to be careful about semantics when
eventually converting back.

> If the machine independent part of the compiler decides oh we can do this
> operation because some operations are present (such as move, negate, absolute
> value, and compare), then you likely will wind up promoting the 64-bit type(s)
> to 128-bit, doing a call to a slower 128-bit function, and then truncating the
> value back to 64-bit is faster than calling a 64-bit emulation function.

This does not in general have the correct semantics though (without
-ffast-math), so the compiler will not do things like it.

> While for the PowerPC, we want to control what is the logical wider type for
> floating point types, I can imagine we don't want all backends to have to
> implment these hooks if they just have the standard 2-3 floating point modes.

KFmode is not wider than IFmode.  IFmode is not wider than KFmode.
KFmode can represent values IFmode cannot.  IFmode can represent values
KFmode cannot.

> I purposelly haven't been looking into 16-bit floating point support, but I
> have to imagine there you have the problem that there are at least 2-3
> different 16-bit formats roaming around.  This is essentially the same issue 
> in
> the PowerPC where you have 2 128-bit floating point types, neither of which is
> a pure subset of the other.

There are at least six 16-bit float formats in actual use.  But we care
mostly about IEEE binary16 and about bfloat16, each with or without
non-normal values (and the encodings for those reused for extra normal
numbers, perhaps).

> To my way of thinking, it is a many branching tree.  On the PowerPC, you want
> SDmode to promote to DDmode,

But it is the backend that should be in control there, not generic code.
And that is the way things already work!

> and possibly to TDmode.  And SFmode mode would
> promote to DFmode,

In general we do not, it would not have the correct semantics.

Anyway, to get us back on track a bit:

There should not be any non-explicit conversions from KFmode to IFmode
or vice versa.  Each of those modes can represent values the other can
not.  There is no ordering between those modes in that sense, we should
not pretend there is one.


Segher


Re: Ping---[V3][PATCH 2/2] Add a new warning option -Wstrict-flex-arrays.

2022-12-16 Thread Qing Zhao via Gcc-patches
FYI.

Committed this last patch as:
https://jira.oci.oraclecorp.com/browse/OLDIS-21095

I will come up with the update to gcc-13/changes.html for -fstrict-flex-arrays 
very soon.

Thanks.

Qing
> On Dec 16, 2022, at 9:49 AM, Qing Zhao via Gcc-patches 
>  wrote:
> 
> 
> 
>> On Dec 16, 2022, at 4:17 AM, Richard Biener  wrote:
>> 
>> On Thu, 15 Dec 2022, Qing Zhao wrote:
>> 
>>> 
>>> 
 On Dec 15, 2022, at 2:47 AM, Richard Biener  wrote:
 
 On Wed, 14 Dec 2022, Qing Zhao wrote:
 
> Hi, Richard,
> 
> I guess that we now agreed on the following:
> 
> “ the information that we ran into a trailing array but didn't consider 
> it a flex array because of -fstrict-flex-arrays is always a useful 
> information”
> 
> The only thing we didn’t decide is:
> 
> A. Amend such new information to -Warray-bounds when 
> -fstrict-flex-arrays=N (N>0) specified.
> 
> OR
> 
> B. Issue such new information with a new warning option 
> -Wstrict-flex-arrays when -fstrict-flex-arrays=N (N>0) specified.
> 
> My current patch implemented B. 
 
 Plus it implements it to specify a different flex-array variant for
 the extra diagnostic.
>>> Could you clarify a little bit on this? (Don’t quite understand…)
 
> If you think A is better, I will change the patch as A. 
 
 I would tend to A since, as I said, it's useful information that
 shouldn't be hidden and not adding an option removes odd combination
 possibilities such as -Wno-array-bounds -Wstrict-flex-arrays.
>>> 
>>> With current implementation, the above combination will ONLY report the 
>>> misuse of trailing array as flex-array.  No out-of-bounds warnings 
>>> issued.
>>> 
 In particular I find, say, -fstrict-flex-arrays=2 -Wstrict-flex-arrays=1
 hardly useful.
>>> 
>>> The above combination will NOT happen, because there is NO level argument 
>>> for -Wstrict-flex-arrays.
>>> 
>>> The only combination will be:-fstrict-flex-arrays=N -Wstrict-flex-arrays
>>> 
>>> When N > 0, -Wstrict-flex-arrays will report any misuse of trailing arrays 
>>> as flexible array per the value of N.
 
 But I'm interested in other opinions.
>>> 
>>> Adding a separate -Wstrict-flex-arrays will provide users a choice to ONLY 
>>> look at the mis-use of trailing arrays as flex-arrays.  Without this new 
>>> option, such information will be buried into tons of out-of-bounds messges. 
>>> 
>>> I think this is the major benefit to have one separate new warning 
>>> -Wstrict-flex-arrays. 
>>> 
>>> Do we need to provide the users this choice?
>> 
>> Ah, OK - I can see the value of auditing code this way before
>> enabling -fstrict-flex-arrays.
> Yes, I think the major benefit of this option is to help users to identify 
> all the places where the trailing arrays are misused as flex-arrays at 
> different level of -fstrict-flex-arrays=N, then update their source code 
> accordingly. And finally can enable -fstrict-flex-arrays by default.
>> 
>>> +  if (opts->x_warn_strict_flex_arrays)
>>> +if (opts->x_flag_strict_flex_arrays == 0)
>>> +  {
>>> + opts->x_warn_strict_flex_arrays = 0;
>>> + warning_at (UNKNOWN_LOCATION, 0,
>>> + "%<-Wstrict-flex-arrays%> is ignored when"
>>> + " %<-fstrict-flex-arrays%> does not present");
>> 
>> "is not present”.
> Okay.
>> 
>> The patch is OK with that change.
> Thanks! Will commit the patch after the change.
>> 
>> Thanks and sorry for the slow process ...
> 
> Thank you for your patience and questions.
> The discussion is very helpful since I was not 100% sure whether this new 
> warning is necessary or not in the beginning, but now after this discussion I 
> feel confident that it’s a necessary option to be added.
> 
> Qing
> 
>> 
>> Richard.
>> 
>>> Thanks.
>>> 
>>> Qing
 
 Thanks,
 Richard.
 
> Let me know your opinion.
> 
> thanks.
> 
> Qing
> 
> 
>> On Dec 14, 2022, at 4:03 AM, Richard Biener  wrote:
>> 
>> On Tue, 13 Dec 2022, Qing Zhao wrote:
>> 
>>> Richard, 
>>> 
>>> Do you have any decision on this one? 
>>> Do we need this warning option For GCC? 
>> 
>> Looking at the testcases it seems that the diagnostic amends
>> -Warray-bounds diagnostics for trailing but not flexible arrays?
>> Wouldn't it be better to generally diagnose this, so have
>> -Warray-bounds, with -fstrict-flex-arrays, for
>> 
>> struct X { int a[1]; };
>> int foo (struct X *p)
>> {
>> return p->a[1];
>> }
>> 
>> emit
>> 
>> warning: array subscript 1 is above array bounds ...
>> note: the trailing array is only a flexible array member with 
>> -fno-strict-flex-arrays
>> 
>> ?  Having -Wstrict-flex-arrays=N and N not agree with the
>> -fstrict-flex-arrays level sounds hardly useful to me but the
>> information that we ran into a trailing array but 

Re: [PATCH] initialize fde objects lazily

2022-12-16 Thread Jason Merrill via Gcc-patches

On 12/9/22 12:34, Thomas Neumann wrote:

When registering an unwind frame with __register_frame_info_bases
we currently initialize that fde object eagerly. This has the
advantage that it is immutable afterwards and we can safely
access it from multiple threads, but it has the disadvantage
that we pay the initialization cost even if the application
never throws an exception.

This commit changes the logic to initialize the objects lazily.
The objects themselves are inserted into the b-tree when
registering the frame, but the sorted fde_vector is
not constructed yet. Only on the first time that an
exception tries to pass through the registered code the
object is initialized. We notice that with a double checking,
first doing a relaxed load of the sorted bit and then re-checking
under a mutex when the object was not initialized yet.

Note that the check must implicitly be safe concerning a concurrent
frame deregistration, as trying to deregister a frame that is
on the unwinding path of a concurrent exception is inherently racy.


OK, thanks.


libgcc/ChangeLog:
     * unwind-dw2-fde.c: Initialize fde object lazily when
     the first exception tries to pass through.
---
  libgcc/unwind-dw2-fde.c | 52 -
  1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..6f69c20ff4b 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -63,8 +63,6 @@ release_registered_frames (void)

  static void
  get_pc_range (const struct object *ob, uintptr_type *range);
-static void
-init_object (struct object *ob);

  #else
  /* Without fast path frame deregistration must always succeed.  */
@@ -76,6 +74,7 @@ static const int in_shutdown = 0;
     by decreasing value of pc_begin.  */
  static struct object *unseen_objects;
  static struct object *seen_objects;
+#endif

  #ifdef __GTHREAD_MUTEX_INIT
  static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
@@ -103,7 +102,6 @@ init_object_mutex_once (void)
  static __gthread_mutex_t object_mutex;
  #endif
  #endif
-#endif

  /* Called from crtbegin.o to register the unwind info for an object.  */

@@ -126,10 +124,7 @@ __register_frame_info_bases (const void *begin, 
struct object *ob,

  #endif

  #ifdef ATOMIC_FDE_FAST_PATH
-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
    btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -180,10 +175,7 @@ __register_frame_info_table_bases (void *begin, 
struct object *ob,

    ob->s.b.encoding = DW_EH_PE_omit;

  #ifdef ATOMIC_FDE_FAST_PATH
-  // Initialize eagerly to avoid locking later
-  init_object (ob);
-
-  // And register the frame
+  // Register the frame in the b-tree
    uintptr_type range[2];
    get_pc_range (ob, range);
    btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
@@ -892,7 +884,15 @@ init_object (struct object* ob)
    accu.linear->orig_data = ob->u.single;
    ob->u.sort = accu.linear;

+#ifdef ATOMIC_FDE_FAST_PATH
+  // We must update the sorted bit with an atomic operation
+  struct object tmp;
+  tmp.s.b = ob->s.b;
+  tmp.s.b.sorted = 1;
+  __atomic_store (&(ob->s.b), &(tmp.s.b), __ATOMIC_SEQ_CST);
+#else
    ob->s.b.sorted = 1;
+#endif
  }

  #ifdef ATOMIC_FDE_FAST_PATH
@@ -1130,6 +1130,21 @@ search_object (struct object* ob, void *pc)
  }
  }

+#ifdef ATOMIC_FDE_FAST_PATH
+
+// Check if the object was already initialized
+static inline bool
+is_object_initialized (struct object *ob)
+{
+  // We have to use relaxed atomics for the read, which
+  // is a bit involved as we read from a bitfield
+  struct object tmp;
+  __atomic_load (&(ob->s.b), &(tmp.s.b), __ATOMIC_RELAXED);
+  return tmp.s.b.sorted;
+}
+
+#endif
+
  const fde *
  _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
  {
@@ -1141,6 +1156,21 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases 
*bases)

    if (!ob)
  return NULL;

+  // Initialize the object lazily
+  if (!is_object_initialized (ob))
+    {
+  // Check again under mutex
+  init_object_mutex_once ();
+  __gthread_mutex_lock (&object_mutex);
+
+  if (!ob->s.b.sorted)
+    {
+  init_object (ob);
+    }
+
+  __gthread_mutex_unlock (&object_mutex);
+    }
+
    f = search_object (ob, pc);
  #else





Re: [PATCH v2] RISC-V: Produce better code with complex constants [PR95632] [PR106602]

2022-12-16 Thread Jeff Law via Gcc-patches




On 12/9/22 11:25, Raphael Moreira Zinsly wrote:

Changes since v1:
- Fixed formatting issues.
- Added a name to the define_insn_and_split pattern.
- Set the target on the 'dg-do compile' in pr106602.c.
- Removed the rv32 restriction in pr95632.c.

-- >8 --

Due to RISC-V limitations on operations with big constants combine
is failing to match such operations and is not being able to
produce optimal code as it keeps splitting them.  By pretending we
can do those operations we can get more opportunities for
simplification of surrounding instructions.

2022-12-06  Raphael Moreira Zinsly  
Jeff Law  

gcc/Changelog:
PR target/95632
PR target/106602
* config/riscv/riscv.md: New pattern to simulate complex
const_int loads.

gcc/testsuite/ChangeLog:
* gcc.target/riscv/pr95632.c: New test.
* gcc.target/riscv/pr106602.c: New test.
So I was doing a bit of testing around this.  I think we're 
going to need a v3.


The problem is at -O1 several passes do not run.  In particular the 
post-loop CSE pass isn't run.  That causes cse_not_expected to be set 
earlier in the pipeline which in turn means the new pattern is exposed 
to fwprop -- too early IMHO and with more potential to cause minor 
regressions as can be seen with riscv/load-immediate.c


Given that bridge patterns are fairly standard and that combining, then 
resplitting has also become relatively standard (particularly for 
eliminating some data dependencies and late lowering) having a state 
variable to indicate that combine has started and such patterns should 
be exposed seems sensible.


An alternate approach would be to do something more hackish like adding 
&& flag_rerun_cse_after_loop to the condition.  That's better than what 
we do now, but not as accurate as biting the bullet and making a state 
variable for combine.


I'll cobble something together and test it.  It'll require a wider test 
because it'll touch target independent files.


Jeff



Re: Adding a new thread model to GCC

2022-12-16 Thread Jonathan Wakely via Gcc-patches
On Mon, 31 Oct 2022 at 09:19, Eric Botcazou via Libstdc++
 wrote:
> I have attached a revised version of the original patch at:
>   https://gcc.gnu.org/legacy-ml/gcc-patches/2019-06/msg01840.html
>
> This reimplements the GNU threads library on native Windows (except for the
> Objective-C specific subset) using direct Win32 API calls, in lieu of the
> implementation based on semaphores.  This base implementation requires
> Windows XP/Server 2003, which was the default minimal setting of MinGW-W64
> until end of 2020.  This also adds the support required for the C++11 threads,
> using again direct Win32 API calls; this additional layer requires Windows
> Vista/Server 2008 and is enabled only if _WIN32_WINNT >= 0x0600.
>
> This also changes libstdc++ to pass -D_WIN32_WINNT=0x0600 but only when the
> switch --enable-libstdcxx-threads is passed, which means that C++11 threads
> are still disabled by default *unless* MinGW-W64 itself is configured for
> Windows Vista/Server 2008 or later by default (this has been the case in
> the development version since end of 2020, for earlier versions you can
> configure it --with-default-win32-winnt=0x0600 to get the same effect).
>
> I only manually tested it on i686-w64-mingw32 and x86_64-w64-mingw32 but
> AdaCore has used it in their C/C++/Ada compilers for 3 years now and the
> 30_threads chapter of the libstdc++ testsuite was clean at the time.

The libgcc parts look reasonable to me, but I can't approve them.
Maybe Jonathan Yong can approve those parts as mingw-w64 target
maintainer, or maybe a libgcc approver can do so.

The libstdc++ parts are OK for trunk. IIUC they could go in
separately, they just wouldn't be very much use without the libgcc
changes.


> 2022-10-31  Eric Botcazou  
>
> libgcc/
> * config.host (i[34567]86-*-mingw*): Add thread fragment after EH one
> as well as new i386/t-slibgcc-mingw fragment.
> (x86_64-*-mingw*): Likewise.
> * config/i386/gthr-win32.h: If _WIN32_WINNT is at least 0x0600, define
> both __GTHREAD_HAS_COND and __GTHREADS_CXX0X to 1.
> Error out if _GTHREAD_USE_MUTEX_TIMEDLOCK is 1.
> Include stdlib.h instead of errno.h and do not include _mingw.h.
> (CONST_CAST2): Add specific definition for C++.
> (ATTRIBUTE_UNUSED): New macro.
> (__UNUSED_PARAM): Delete.
> Define WIN32_LEAN_AND_MEAN before including windows.h.
> (__gthread_objc_data_tls): Use TLS_OUT_OF_INDEXES instead of 
> (DWORD)-1.
> (__gthread_objc_init_thread_system): Likewise.
> (__gthread_objc_thread_get_data): Minor tweak.
> (__gthread_objc_condition_allocate): Use ATTRIBUTE_UNUSED.
> (__gthread_objc_condition_deallocate): Likewise.
> (__gthread_objc_condition_wait): Likewise.
> (__gthread_objc_condition_broadcast): Likewise.
> (__gthread_objc_condition_signal): Likewise.
> Include sys/time.h.
> (__gthr_win32_DWORD): New typedef.
> (__gthr_win32_HANDLE): Likewise.
> (__gthr_win32_CRITICAL_SECTION): Likewise.
> (__gthr_win32_CONDITION_VARIABLE): Likewise.
> (__gthread_t): Adjust.
> (__gthread_key_t): Likewise.
> (__gthread_mutex_t): Likewise.
> (__gthread_recursive_mutex_t): Likewise.
> (__gthread_cond_t): New typedef.
> (__gthread_time_t): Likewise.
> (__GTHREAD_MUTEX_INIT_DEFAULT): Delete.
> (__GTHREAD_RECURSIVE_MUTEX_INIT_DEFAULT): Likewise.
> (__GTHREAD_COND_INIT_FUNCTION): Define.
> (__GTHREAD_TIME_INIT): Likewise.
> (__gthr_i486_lock_cmp_xchg): Delete.
> (__gthr_win32_create): Declare.
> (__gthr_win32_join): Likewise.
> (__gthr_win32_self): Likewise.
> (__gthr_win32_detach): Likewise.
> (__gthr_win32_equal): Likewise.
> (__gthr_win32_yield): Likewise.
> (__gthr_win32_mutex_destroy): Likewise.
> (__gthr_win32_cond_init_function): Likewise if __GTHREADS_HAS_COND is 
> 1.
> (__gthr_win32_cond_broadcast): Likewise.
> (__gthr_win32_cond_signal): Likewise.
> (__gthr_win32_cond_wait): Likewise.
> (__gthr_win32_cond_timedwait): Likewise.
> (__gthr_win32_recursive_mutex_init_function): Delete.
> (__gthr_win32_recursive_mutex_lock): Likewise.
> (__gthr_win32_recursive_mutex_unlock): Likewise.
> (__gthr_win32_recursive_mutex_destroy): Likewise.
> (__gthread_create): New inline function.
> (__gthread_join): Likewise.
> (__gthread_self): Likewise.
> (__gthread_detach): Likewise.
> (__gthread_equal): Likewise.
> (__gthread_yield): Likewise.
> (__gthread_cond_init_function): Likewise if __GTHREADS_HAS_COND is 1.
> (__gthread_cond_broadcast): Likewise.
> (__gthread_cond_signal): Likewise.
> (__gthread_cond_wait): Likewise.
> (__gthread_cond_timedwait): Likewise.
> 

Re: [PATCH] c++: empty captured var as template argument [PR107437]

2022-12-16 Thread Jason Merrill via Gcc-patches

On 12/16/22 11:45, Patrick Palka wrote:

Here we're rejecting the use of the captured 't' (of empty type) as a
template argument ultimately because convert_nontype_argument checks
potentiality using is_constant_expression which returns false for
captured variables since want_rval=false.  But in this case an
lvalue-to-rvalue conversion of the argument is implied, so I believe we
should be checking is_rvalue_constant_expression.

This patch defines an rvalue version of is_nondep_const_expr and uses
it in convert_nontype_argument when the target type is a non-reference.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look right?
Alternatively, since 'non_dep' in convert_nontype_argument controls only
whether we should call instantiate_non_dependent_expr, it seems weird to
me that we care about potentiality at all.  So I experimented with using
instantiation_dependent_expression_p here instead, and that seemed to
work well and even fixed the dg-ice'd test cpp1z/constexpr-lambda26.C.
In r12-7564-gec0f53a3a542e7 we made a similar change to decltype.


This approach sounds good to me.


PR c++/107437

gcc/cp/ChangeLog:

* constexpr.cc (is_nondependent_rvalue_constant_expression):
Define.
* cp-tree.h (is_nondependent_rvalue_constant_expression):
Declare.
* pt.cc (convert_nontype_argument): Use it instead of the
non-rvalue version when converting to a non-reference type.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1y/lambda-generic-107437.C: New test.
---
  gcc/cp/constexpr.cc   | 10 +
  gcc/cp/cp-tree.h  |  1 +
  gcc/cp/pt.cc  |  4 +++-
  .../g++.dg/cpp1y/lambda-generic-107437.C  | 22 +++
  4 files changed, 36 insertions(+), 1 deletion(-)
  create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index e43d92864f5..77e0e261cca 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -10139,6 +10139,16 @@ is_nondependent_constant_expression (tree t)
  && !instantiation_dependent_expression_p (t));
  }
  
+/* As above, but expect an rvalue.  */

+
+bool
+is_nondependent_rvalue_constant_expression (tree t)
+{
+  return (!type_unknown_p (t)
+ && is_rvalue_constant_expression (t)
+ && !instantiation_dependent_expression_p (t));
+}
+
  /* Returns true if T is a potential static initializer expression that is not
 instantiation-dependent.  */
  
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h

index 0d6c234b3b0..2d4c5cc3ebd 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -8471,6 +8471,7 @@ extern bool potential_constant_expression   (tree);
  extern bool is_constant_expression (tree);
  extern bool is_rvalue_constant_expression (tree);
  extern bool is_nondependent_constant_expression (tree);
+extern bool is_nondependent_rvalue_constant_expression (tree);
  extern bool is_nondependent_static_init_expression (tree);
  extern bool is_static_init_expression(tree);
  extern bool is_std_allocator (tree);
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index bc566ab702b..87caea5f202 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -7318,7 +7318,9 @@ convert_nontype_argument (tree type, tree expr, 
tsubst_flags_t complain)
&& has_value_dependent_address (expr))
  /* If we want the address and it's value-dependent, don't fold.  */;
else if (processing_template_decl
-  && is_nondependent_constant_expression (expr))
+  && (TYPE_REF_P (type)
+  ? is_nondependent_constant_expression (expr)
+  : is_nondependent_rvalue_constant_expression (expr)))
  non_dep = true;
if (error_operand_p (expr))
  return error_mark_node;
diff --git a/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C 
b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
new file mode 100644
index 000..934f863659d
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
@@ -0,0 +1,22 @@
+// PR c++/107437
+// { dg-do compile { target c++11 } }
+
+struct integral_constant {
+  constexpr operator int() const { return 42; }
+};
+
+template
+struct A {
+  static constexpr int value = N;
+};
+
+template
+void f(T t) {
+  [=](auto) {
+A a; // { dg-bogus "constant" }
+return a.value;
+  }(0);
+}
+
+template void f(integral_constant);
+




[PATCH] gccrs: avoid printing to stderr in selftest::rust_flatten_list

2022-12-16 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

OK for trunk?

gcc/rust/ChangeLog:
* resolve/rust-ast-resolve-item.cc (selftest::rust_flatten_list):
Remove output to stderr.

Signed-off-by: David Malcolm 
---
 gcc/rust/resolve/rust-ast-resolve-item.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/gcc/rust/resolve/rust-ast-resolve-item.cc 
b/gcc/rust/resolve/rust-ast-resolve-item.cc
index 0c38f28d530..1276e845acc 100644
--- a/gcc/rust/resolve/rust-ast-resolve-item.cc
+++ b/gcc/rust/resolve/rust-ast-resolve-item.cc
@@ -1202,9 +1202,6 @@ rust_flatten_list (void)
   auto paths = std::vector ();
   Rust::Resolver::flatten_list (list, paths);
 
-  for (auto &path : paths)
-fprintf (stderr, "%s\n", path.as_string ().c_str ());
-
   ASSERT_TRUE (!paths.empty ());
   ASSERT_EQ (paths.size (), 2);
   ASSERT_EQ (paths[0].get_segments ()[0].as_string (), "foo");
-- 
2.26.3



[PATCH] gccrs: add selftest-rust-gdb and selftest-rust-valgrind "make" targets

2022-12-16 Thread David Malcolm via Gcc-patches
Add "make" targets to make it easy to run the rust selftests under gdb
and under valgrind via:
  make selftest-rust-gdb
and
  make selftest-rust-valgrind
respectively, similar to analogous "make" targets in the C and C++
frontends.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

OK for trunk?

gcc/rust/ChangeLog:
* Make-lang.in (selftest-rust-gdb): New.
(selftest-rust-valgrind): New.

Signed-off-by: David Malcolm 
---
 gcc/rust/Make-lang.in | 12 
 1 file changed, 12 insertions(+)

diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in
index 681ac7b3fee..76015b3426b 100644
--- a/gcc/rust/Make-lang.in
+++ b/gcc/rust/Make-lang.in
@@ -275,6 +275,18 @@ s-selftest-rust: $(RUST_SELFTEST_DEPS)
$(GCC_FOR_TARGET) $(RUST_SELFTEST_FLAGS)
$(STAMP) $@
 
+# Convenience methods for running rust selftests under gdb:
+.PHONY: selftest-rust-gdb
+selftest-rust-gdb: $(RUST_SELFTEST_DEPS)
+   $(GCC_FOR_TARGET) $(RUST_SELFTEST_FLAGS) \
+ -wrapper gdb,--args
+
+# Convenience methods for running rust selftests under valgrind:
+.PHONY: selftest-rust-valgrind
+selftest-rust-valgrind: $(RUST_SELFTEST_DEPS)
+   $(GCC_FOR_TARGET) $(RUST_SELFTEST_FLAGS) \
+ -wrapper valgrind,--leak-check=full
+
 # Install info documentation for the front end, if it is present in the source 
directory. This target
 # should have dependencies on info files that should be installed.
 rust.install-info:
-- 
2.26.3



Re: [PATCH] RISC-V: Fix up some wording in the mcpu/mtune comment

2022-12-16 Thread Palmer Dabbelt

On Mon, 28 Nov 2022 13:37:25 PST (-0800), Palmer Dabbelt wrote:

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_option_override): Fix comment
wording.
---
Just stumbled on this one looking at the output of that sed script.  The
script itself was fine, the original comment was to blame.
---
 gcc/config/riscv/riscv.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 05bdba5ab4d..26c11507895 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -5978,7 +5978,7 @@ riscv_option_override (void)
 target_flags |= MASK_FDIV;

   /* Handle -mtune, use -mcpu if -mtune is not given, and use default -mtune
- if -mtune and -mcpu both not given.  */
+ if both -mtune and -mcpu are not given.  */
   cpu = riscv_parse_tune (riscv_tune_string ? riscv_tune_string :
  (riscv_cpu_string ? riscv_cpu_string :
   RISCV_TUNE_STRING_DEFAULT));


Committed.


Re: [PATCH] RISC-V: Note that __builtin_riscv_pause() implies Xgnuzihintpausestate

2022-12-16 Thread Palmer Dabbelt

On Mon, 28 Nov 2022 10:45:51 PST (-0800), Palmer Dabbelt wrote:

On Fri, 18 Nov 2022 09:01:08 PST (-0800), Palmer Dabbelt wrote:

On Thu, 17 Nov 2022 22:59:08 PST (-0800), Kito Cheng wrote:

Wait, what's Xgnuzihintpausestate???


I just made it up, it's defined right next to the name like those
profile extensions are.  I figured that's the most RISC-V way to define
something like this, but we could just drop it and run with the
definition -- IIRC we just stuck a comment in for Linux and QEMU, I
doubt anyone is actually going to implement the "doesn't touch PC"
version of pause.


Just checking up on this one.  I don't care a ton about the name, just
that we document where we're intentionally violating the specs.


I'm just committing this one, no big deal if you want to change the 
wording.  I just want it out of my queue.







On Fri, Nov 18, 2022 at 12:30 PM Palmer Dabbelt  wrote:


gcc/ChangeLog:

* doc/extend.texi (__builtin_riscv_pause): Imply
Xgnuzihintpausestate.
---
 gcc/doc/extend.texi | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index b1dd39e64b8..26f14e61bc8 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -21103,7 +21103,9 @@ Returns the value that is currently set in the 
@samp{tp} register.
 @end deftypefn

 @deftypefn {Built-in Function}  void __builtin_riscv_pause (void)
-Generates the @code{pause} (hint) machine instruction.
+Generates the @code{pause} (hint) machine instruction.  This implies the
+Xgnuzihintpausestate extension, which redefines the @code{pause} instruction to
+change architectural state.
 @end deftypefn

 @node RX Built-in Functions
--
2.38.1



[PATCH] c++: empty captured var as template argument [PR107437]

2022-12-16 Thread Patrick Palka via Gcc-patches
Here we're rejecting the use of the captured 't' (of empty type) as a
template argument ultimately because convert_nontype_argument checks
potentiality using is_constant_expression which returns false for
captured variables since want_rval=false.  But in this case an
lvalue-to-rvalue conversion of the argument is implied, so I believe we
should be checking is_rvalue_constant_expression.

This patch defines an rvalue version of is_nondep_const_expr and uses
it in convert_nontype_argument when the target type is a non-reference.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look right?
Alternatively, since 'non_dep' in convert_nontype_argument controls only
whether we should call instantiate_non_dependent_expr, it seems weird to
me that we care about potentiality at all.  So I experimented with using
instantiation_dependent_expression_p here instead, and that seemed to
work well and even fixed the dg-ice'd test cpp1z/constexpr-lambda26.C.
In r12-7564-gec0f53a3a542e7 we made a similar change to decltype.

PR c++/107437

gcc/cp/ChangeLog:

* constexpr.cc (is_nondependent_rvalue_constant_expression):
Define.
* cp-tree.h (is_nondependent_rvalue_constant_expression):
Declare.
* pt.cc (convert_nontype_argument): Use it instead of the
non-rvalue version when converting to a non-reference type.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1y/lambda-generic-107437.C: New test.
---
 gcc/cp/constexpr.cc   | 10 +
 gcc/cp/cp-tree.h  |  1 +
 gcc/cp/pt.cc  |  4 +++-
 .../g++.dg/cpp1y/lambda-generic-107437.C  | 22 +++
 4 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index e43d92864f5..77e0e261cca 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -10139,6 +10139,16 @@ is_nondependent_constant_expression (tree t)
  && !instantiation_dependent_expression_p (t));
 }
 
+/* As above, but expect an rvalue.  */
+
+bool
+is_nondependent_rvalue_constant_expression (tree t)
+{
+  return (!type_unknown_p (t)
+ && is_rvalue_constant_expression (t)
+ && !instantiation_dependent_expression_p (t));
+}
+
 /* Returns true if T is a potential static initializer expression that is not
instantiation-dependent.  */
 
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 0d6c234b3b0..2d4c5cc3ebd 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -8471,6 +8471,7 @@ extern bool potential_constant_expression   (tree);
 extern bool is_constant_expression (tree);
 extern bool is_rvalue_constant_expression (tree);
 extern bool is_nondependent_constant_expression (tree);
+extern bool is_nondependent_rvalue_constant_expression (tree);
 extern bool is_nondependent_static_init_expression (tree);
 extern bool is_static_init_expression(tree);
 extern bool is_std_allocator (tree);
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index bc566ab702b..87caea5f202 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -7318,7 +7318,9 @@ convert_nontype_argument (tree type, tree expr, 
tsubst_flags_t complain)
   && has_value_dependent_address (expr))
 /* If we want the address and it's value-dependent, don't fold.  */;
   else if (processing_template_decl
-  && is_nondependent_constant_expression (expr))
+  && (TYPE_REF_P (type)
+  ? is_nondependent_constant_expression (expr)
+  : is_nondependent_rvalue_constant_expression (expr)))
 non_dep = true;
   if (error_operand_p (expr))
 return error_mark_node;
diff --git a/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C 
b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
new file mode 100644
index 000..934f863659d
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp1y/lambda-generic-107437.C
@@ -0,0 +1,22 @@
+// PR c++/107437
+// { dg-do compile { target c++11 } }
+
+struct integral_constant {
+  constexpr operator int() const { return 42; }
+};
+
+template
+struct A {
+  static constexpr int value = N;
+};
+
+template
+void f(T t) {
+  [=](auto) {
+A a; // { dg-bogus "constant" }
+return a.value;
+  }(0);
+}
+
+template void f(integral_constant);
+
-- 
2.39.0.56.g57e2c6ebbe



Re: [RFC PATCH] ipa-cp: Speculatively call specialized functions

2022-12-16 Thread Manolis Tsamis
On Mon, Nov 14, 2022 at 9:37 AM Richard Biener
 wrote:
>
> On Sun, Nov 13, 2022 at 4:38 PM Christoph Muellner
>  wrote:
> >
> > From: mtsamis 
> >
> > The IPA CP pass offers a wide range of optimizations, where most of them
> > lead to specialized functions that are called from a call site.
> > This can lead to multiple specialized function clones, if more than
> > one call-site allows such an optimization.
> > If not all call-sites can be optimized, the program might end
> > up with call-sites to the original function.
> >
> > This pass assumes that non-optimized call-sites (i.e. call-sites
> > that don't call specialized functions) are likely to be called
> > with arguments that would allow calling specialized clones.
> > Since we cannot guarantee this (for obvious reasons), we can't
> > replace the existing calls. However, we can introduce dynamic
> > guards that test the arguments for the collected constants
> > and calls the specialized function if there is a match.
> >
> > To demonstrate the effect, let's consider the following program part:
> >
> >   func_1()
> > myfunc(1)
> >   func_2()
> > myfunc(2)
> >   func_i(i)
> > myfunc(i)
> >
> > In this case the transformation would do the following:
> >
> >   func_1()
> > myfunc.constprop.1() // myfunc() with arg0 == 1
> >   func_2()
> > myfunc.constprop.2() // myfunc() with arg0 == 2
> >   func_i(i)
> > if (i == 1)
> >   myfunc.constprop.1() // myfunc() with arg0 == 1
> > else if (i == 2)
> >   myfunc.constprop.2() // myfunc() with arg0 == 2
> > else
> >   myfunc(i)
> >
> > The pass consists of two main parts:
> > * collecting all specialized functions and the argument/constant pair(s)
> > * insertion of the guards during materialization
> >
> > The patch integrates well into ipa-cp and related IPA functionality.
> > Given the nature of IPA, the changes are touching many IPA-related
> > files as well as call-graph data structures.
> >
> > The impact of the dynamic guard is expected to be less than the speedup
> > gained by enabled optimizations (e.g. inlining or constant propagation).
>
> I don't see any limits on the number of callee candidates or the complexity
> of the guard.  Is there any reason to not factor the guards into a wrapper
> function to avoid bloating cold call sites and to allow inlining to decide
> where the expansion is useful?
>
> Skimming the patch I noticed an #if 0 commented assert with a comment
> that this was to be temporary?
>

I have sent a V2 of this with both issues addressed. It also contains a number
of other important fixes.

Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-December/608639.html

Thanks!
Manolis

> Thanks,
> Richard.
>
> > PR ipa/107667
> > gcc/Changelog:
> >
> > * cgraph.cc (cgraph_add_edge_to_call_site_hash): Add support for 
> > guarded specialized edges.
> > (cgraph_edge::set_call_stmt): Likewise.
> > (symbol_table::create_edge): Likewise.
> > (cgraph_edge::remove): Likewise.
> > (cgraph_edge::make_speculative): Likewise.
> > (cgraph_edge::make_specialized): Likewise.
> > (cgraph_edge::remove_specializations): Likewise.
> > (cgraph_edge::redirect_call_stmt_to_callee): Likewise.
> > (cgraph_edge::dump_edge_flags): Likewise.
> > (verify_speculative_call): Likewise.
> > (verify_specialized_call): Likewise.
> > (cgraph_node::verify_node): Likewise.
> > * cgraph.h (class GTY): Add new class that contains info of 
> > specialized edges.
> > * cgraphclones.cc (cgraph_edge::clone): Add support for guarded 
> > specialized edges.
> > (cgraph_node::set_call_stmt_including_clones): Likewise.
> > * ipa-cp.cc (want_remove_some_param_p): Likewise.
> > (create_specialized_node): Likewise.
> > (add_specialized_edges): Likewise.
> > (ipcp_driver): Likewise.
> > * ipa-fnsummary.cc (redirect_to_unreachable): Likewise.
> > (ipa_fn_summary_t::duplicate): Likewise.
> > (analyze_function_body): Likewise.
> > (estimate_edge_size_and_time): Likewise.
> > (remap_edge_summaries): Likewise.
> > * ipa-inline-transform.cc (inline_transform): Likewise.
> > * ipa-inline.cc (edge_badness): Likewise.
> >  lto-cgraph.cc (lto_output_edge): Likewise.
> > (input_edge): Likewise.
> > * tree-inline.cc (copy_bb): Likewise.
> > * value-prof.cc (gimple_sc): Add function to create guarded 
> > specializations.
> > * value-prof.h (gimple_sc): Likewise.
> >
> > Signed-off-by: Manolis Tsamis 
> > ---
> >  gcc/cgraph.cc   | 316 +++-
> >  gcc/cgraph.h| 102 
> >  gcc/cgraphclones.cc |  30 
> >  gcc/common.opt  |   4 +
> >  gcc/ipa-cp.cc   | 105 
> >  gcc/ipa-fnsummary.cc|  42 +
> >  gcc/ipa-inline-transform.cc |  11 ++
> >  

[Patch] nvptx/mkoffload.cc: Add dummy proc for OpenMP rev-offload table [PR108098]

2022-12-16 Thread Tobias Burnus

Seems to be a CUDA JIT issue - which is fixed by adding a dummy procedure.

Lightly tested with 4 systems at hand, where 2 failed before. One had 10.2 and
the other had some ancient CUDA where 'nvidia-smi' did not print a CUDA version
and requires -mptx=3.1.
(I did check that offloading indeed happened and no hostfallback was done.)

OK for mainline?

Tobias
-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
nvptx/mkoffload.cc: Add dummy proc for OpenMP rev-offload table [PR108098]

Seemingly, the ptx JIT of CUDA <= 10.2 replaces function pointers in global
variables by NULL if a translation does not contain any executable code. It
works with CUDA 11.1.  The code of this commit is about reverse offload;
having NULL values disables the side of reverse offload during image load.

Solution is the same as found by Thomas for a related issue: Adding a dummy
procedure. Cf. the PR of this issue and Thomas' patch
"nvptx: Support global constructors/destructors via 'collect2'"
https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607749.html

As that approach also works here:

Co-authored-by: Thomas Schwinge 

gcc/
	PR libgomp/108098

	* config/nvptx/mkoffload.cc (process): Emit dummy procedure
	alongside reverse-offload function table to prevent NULL values
	of the function addresses.

---
 gcc/config/nvptx/mkoffload.cc | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/gcc/config/nvptx/mkoffload.cc b/gcc/config/nvptx/mkoffload.cc
index 5d89ba8..8306aa0 100644
--- a/gcc/config/nvptx/mkoffload.cc
+++ b/gcc/config/nvptx/mkoffload.cc
@@ -357,6 +357,20 @@ process (FILE *in, FILE *out, uint32_t omp_requires)
 	fputc (sm_ver2[i], out);
   fprintf (out, "\"\n\t\".file 1 \\\"\\\"\"\n");
 
+  /* WORKAROUND - see PR 108098
+	 It seems as if older CUDA JIT compiler optimizes the function pointers
+	 in offload_func_table to NULL, which can be prevented by adding a
+	 dummy procedure. With CUDA 11.1, it seems to work fine without
+	 workaround while CUDA 10.2 as well as some ancient versions need the
+	 workaround. Assuming CUDA 11.0 fixes it, emitting it could be
+	 restricted to 'if (sm_ver2[0] < 8 && version2[0] < 7)' as sm_80 and
+	 PTX ISA 7.0 are new in CUDA 11.0; for 11.1 it would be sm_86 and
+	 PTX ISA 7.1.  */
+  fprintf (out, "\n\t\".func __dummy$func ( );\"\n");
+  fprintf (out, "\t\".func __dummy$func ( )\"\n");
+  fprintf (out, "\t\"{\"\n");
+  fprintf (out, "\t\"}\"\n");
+
   size_t fidx = 0;
   for (id = func_ids; id; id = id->next)
 	{


[PATCH v2] ipa-cp: Speculatively call specialized functions

2022-12-16 Thread Manolis Tsamis
The IPA CP pass offers a wide range of optimizations, where most of them
lead to specialized functions that are called from a call site.
This can lead to multiple specialized function clones, if more than
one call-site allows such an optimization.
If not all call-sites can be optimized, the program might end
up with call-sites to the original function.

This pass assumes that non-optimized call-sites (i.e. call-sites
that don't call specialized functions) are likely to be called
with arguments that would allow calling specialized clones.
Since we cannot guarantee this (for obvious reasons), we can't
replace the existing calls. However, we can introduce dynamic
guards that test the arguments for the collected constants
and call the specialized function if there is a match.

To demonstrate the effect, let's consider the following program part:

  func_1()
myfunc(1)
  func_2()
myfunc(2)
  func_i(i)
myfunc(i)

In this case the transformation would do the following:

  func_1()
myfunc.constprop.1() // myfunc() with arg0 == 1
  func_2()
myfunc.constprop.2() // myfunc() with arg0 == 2
  func_i(i)
if (i == 1)
  myfunc.constprop.1() // myfunc() with arg0 == 1
else if (i == 2)
  myfunc.constprop.2() // myfunc() with arg0 == 2
else
  myfunc(i)

The pass consists of two main parts:
* collecting all specialized functions and the argument/constant pair(s)
* insertion of the guards during materialization

The patch integrates well into ipa-cp and related IPA functionality.
Given the nature of IPA, the changes are touching many IPA-related
files as well as call-graph data structures.

The impact of the dynamic guard is expected to be less than the speedup
gained by enabled optimizations (e.g. inlining or constant propagation).

gcc/Changelog:

* cgraph.cc (cgraph_add_edge_to_call_site_hash): Add support for 
guarded specialized edges.
(cgraph_edge::set_call_stmt): Likewise.
(symbol_table::create_edge): Likewise.
(cgraph_edge::remove): Likewise.
(cgraph_edge::make_speculative): Likewise.
(cgraph_edge::make_specialized): Likewise.
(cgraph_edge::remove_specializations): Likewise.
(cgraph_edge::redirect_call_stmt_to_callee): Likewise.
(cgraph_edge::dump_edge_flags): Likewise.
(verify_speculative_call): Likewise.
(verify_specialized_call): Likewise.
(cgraph_node::verify_node): Likewise.
* cgraph.h (class GTY): Add new class that contains info of specialized 
edges.
* cgraphclones.cc (cgraph_edge::clone): Add support for guarded 
specialized edges.
(cgraph_node::set_call_stmt_including_clones): Likewise.
* ipa-cp.cc (want_remove_some_param_p): Likewise.
(create_specialized_node): Likewise.
(add_specialized_edges): Likewise.
(ipcp_driver): Likewise.
* ipa-fnsummary.cc (redirect_to_unreachable): Likewise.
(ipa_fn_summary_t::duplicate): Likewise.
(analyze_function_body): Likewise.
(estimate_edge_size_and_time): Likewise.
(remap_edge_summaries): Likewise.
* ipa-inline-transform.cc (inline_transform): Likewise.
* ipa-inline.cc (edge_badness): Likewise.
* lto-cgraph.cc (lto_output_edge): Likewise.
(input_edge): Likewise.
* tree-inline.cc (copy_bb): Likewise.
* value-prof.cc (gimple_sc): Add function to create guarded 
specializations.
* value-prof.h (gimple_sc): Likewise.

Signed-off-by: Manolis Tsamis 

---

Changes in v2:
  - Added params ipa-guarded-specialization-guard-complexity and
ipa-guarded-specializations-per-edge to control the complexity and 
number
of specialized edges that are created.
  - Create separate clones for the guarded specialized calls.
  - Add more validation checks for the invariants of specialized edges.
  - Fix bugs and improve robustness.

 gcc/cgraph.cc   | 372 ++--
 gcc/cgraph.h| 105 ++
 gcc/cgraphclones.cc |  42 
 gcc/common.opt  |   4 +
 gcc/ipa-cp.cc   | 171 -
 gcc/ipa-fnsummary.cc|  42 
 gcc/ipa-inline-transform.cc |  16 ++
 gcc/ipa-inline.cc   |   1 +
 gcc/lto-cgraph.cc   |  46 +
 gcc/params.opt  |   8 +
 gcc/tree-inline.cc  |  75 +++-
 gcc/value-prof.cc   | 223 +
 gcc/value-prof.h|   1 +
 13 files changed, 1087 insertions(+), 19 deletions(-)

diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc
index f15cb47c8b8..356b1f64756 100644
--- a/gcc/cgraph.cc
+++ b/gcc/cgraph.cc
@@ -718,18 +718,24 @@ cgraph_add_edge_to_call_site_hash (cgraph_edge *e)
  one indirect); always hash the direct one.  */
   if (e->speculative && e->indirect_unknown_callee)
 return;
+  /* There are potentially multiple specialization edges for every
+ specialized call; 

Re: [PATCH, nvptx, 1/2] Reimplement libgomp barriers for nvptx

2022-12-16 Thread Tom de Vries via Gcc-patches

On 9/21/22 09:45, Chung-Lin Tang wrote:

Hi Tom,
I had a patch submitted earlier, where I reported that the current way 
of implementing
barriers in libgomp on nvptx created a quite significant performance 
drop on some SPEChpc2021

benchmarks:
https://gcc.gnu.org/pipermail/gcc-patches/2022-September/600818.html

That previous patch wasn't accepted well (admittedly, it was kind of a 
hack).

So in this patch, I tried to (mostly) re-implement team-barriers for NVPTX.



Ack.

Basically, instead of trying to have the GPU do CPU-with-OS-like things 
that it isn't suited for,
barriers are implemented simplistically with bar.* synchronization 
instructions.
Tasks are processed after threads have joined, and only if 
team->task_count != 0


(arguably, there might be a little bit of performance forfeited where 
earlier arriving threads
could've been used to process tasks ahead of other threads. But that 
again falls into requiring
implementing complex futex-wait/wake like behavior. Really, that kind of 
tasking is not what target

offloading is usually used for)



Please try to add this insight somewhere as a comment in the code, f.i. 
in the header comment of bar.h.



Implementation highlight notes:
1. gomp_team_barrier_wake() is now an empty function (threads never 
"wake" in the usual manner)

2. gomp_team_barrier_cancel() now uses the "exit" PTX instruction.
3. gomp_barrier_wait_last() now is implemented using "bar.arrive"

4. gomp_team_barrier_wait_end()/gomp_team_barrier_wait_cancel_end():
    The main synchronization is done using a 'bar.red' instruction. This 
reduces across all threads
    the condition (team->task_count != 0), to enable the task processing 
down below if any thread
    created a task. (this bar.red usage required the need of the second 
GCC patch in this series)


This patch has been tested on x86_64/powerpc64le with nvptx offloading, 
using libgomp, ovo, omptests,
and sollve_vv testsuites, all without regressions. Also verified that 
the SPEChpc 2021 521.miniswp_t
and 534.hpgmgfv_t performance regressions that occurred in the GCC12 
cycle has been restored to

devel/omp/gcc-11 (OG11) branch levels. Is this okay for trunk?



AFAIU the waiters and lock fields are no longer used, so they can be removed.

Yes, LGTM, please apply (after the other one).

Thanks for addressing this.

FWIW, tested on NVIDIA RTX A2000 with driver 525.60.11.

(also suggest backporting to GCC12 branch, if performance regression can 
be considered a defect)




That's ok, but wait a while after applying on trunk before doing that, 
say a month.


Thanks,
- Tom


Thanks,
Chung-Lin

libgomp/ChangeLog:

2022-09-21  Chung-Lin Tang  

 * config/nvptx/bar.c (generation_to_barrier): Remove.
 (futex_wait,futex_wake,do_spin,do_wait): Remove.
 (GOMP_WAIT_H): Remove.
 (#include "../linux/bar.c"): Remove.
 (gomp_barrier_wait_end): New function.
 (gomp_barrier_wait): Likewise.
 (gomp_barrier_wait_last): Likewise.
 (gomp_team_barrier_wait_end): Likewise.
 (gomp_team_barrier_wait): Likewise.
 (gomp_team_barrier_wait_final): Likewise.
 (gomp_team_barrier_wait_cancel_end): Likewise.
 (gomp_team_barrier_wait_cancel): Likewise.
 (gomp_team_barrier_cancel): Likewise.
 * config/nvptx/bar.h (gomp_team_barrier_wake): Remove
 prototype, add new static inline function.


Re: Ping---[V3][PATCH 2/2] Add a new warning option -Wstrict-flex-arrays.

2022-12-16 Thread Qing Zhao via Gcc-patches


> On Dec 16, 2022, at 4:17 AM, Richard Biener  wrote:
> 
> On Thu, 15 Dec 2022, Qing Zhao wrote:
> 
>> 
>> 
>>> On Dec 15, 2022, at 2:47 AM, Richard Biener  wrote:
>>> 
>>> On Wed, 14 Dec 2022, Qing Zhao wrote:
>>> 
 Hi, Richard,
 
 I guess that we now agreed on the following:
 
 “ the information that we ran into a trailing array but didn't consider 
 it a flex array because of -fstrict-flex-arrays is always a useful 
 information”
 
 The only thing we didn’t decide is:
 
 A. Amend such new information to -Warray-bounds when 
 -fstrict-flex-arrays=N (N>0) specified.
 
 OR
 
 B. Issue such new information with a new warning option 
 -Wstrict-flex-arrays when -fstrict-flex-arrays=N (N>0) specified.
 
 My current patch implemented B. 
>>> 
>>> Plus it implements it to specify a different flex-array variant for
>>> the extra diagnostic.
>> Could you clarify a little bit on this? (Don’t quite understand…)
>>> 
 If you think A is better, I will change the patch as A. 
>>> 
>>> I would tend to A since, as I said, it's useful information that
>>> shouldn't be hidden and not adding an option removes odd combination
>>> possibilities such as -Wno-array-bounds -Wstrict-flex-arrays.
>> 
>> With current implementation, the above combination will ONLY report the 
>> misuse of trailing array as flex-array.  No out-of-bounds warnings 
>> issued.
>> 
>>> In particular I find, say, -fstrict-flex-arrays=2 -Wstrict-flex-arrays=1
>>> hardly useful.
>> 
>> The above combination will NOT happen, because there is NO level argument 
>> for -Wstrict-flex-arrays.
>> 
>> The only combination will be:-fstrict-flex-arrays=N -Wstrict-flex-arrays
>> 
>> When N > 0, -Wstrict-flex-arrays will report any misuse of trailing arrays 
>> as flexible array per the value of N.
>>> 
>>> But I'm interested in other opinions.
>> 
>> Adding a separate -Wstrict-flex-arrays will provide users a choice to ONLY 
>> look at the mis-use of trailing arrays as flex-arrays.  Without this new 
>> option, such information will be buried into tons of out-of-bounds messges. 
>> 
>> I think this is the major benefit to have one separate new warning 
>> -Wstrict-flex-arrays. 
>> 
>> Do we need to provide the users this choice?
> 
> Ah, OK - I can see the value of auditing code this way before
> enabling -fstrict-flex-arrays.
Yes, I think the major benefit of this option is to help users to identify all 
the places where the trailing arrays are misused as flex-arrays at different 
level of -fstrict-flex-arrays=N, then update their source code accordingly. And 
finally can enable -fstrict-flex-arrays by default.
> 
>> +  if (opts->x_warn_strict_flex_arrays)
>> +if (opts->x_flag_strict_flex_arrays == 0)
>> +  {
>> + opts->x_warn_strict_flex_arrays = 0;
>> + warning_at (UNKNOWN_LOCATION, 0,
>> + "%<-Wstrict-flex-arrays%> is ignored when"
>> + " %<-fstrict-flex-arrays%> does not present");
> 
> "is not present”.
Okay.
> 
> The patch is OK with that change.
Thanks! Will commit the patch after the change.
> 
> Thanks and sorry for the slow process ...

Thank you for your patience and questions.
The discussion is very helpful since I was not 100% sure whether this new 
warning is necessary or not in the beginning, but now after this discussion I 
feel confident that it’s a necessary option to be added.

Qing

> 
> Richard.
> 
>> Thanks.
>> 
>> Qing
>>> 
>>> Thanks,
>>> Richard.
>>> 
 Let me know your opinion.
 
 thanks.
 
 Qing
 
 
> On Dec 14, 2022, at 4:03 AM, Richard Biener  wrote:
> 
> On Tue, 13 Dec 2022, Qing Zhao wrote:
> 
>> Richard, 
>> 
>> Do you have any decision on this one? 
>> Do we need this warning option For GCC? 
> 
> Looking at the testcases it seems that the diagnostic amends
> -Warray-bounds diagnostics for trailing but not flexible arrays?
> Wouldn't it be better to generally diagnose this, so have
> -Warray-bounds, with -fstrict-flex-arrays, for
> 
> struct X { int a[1]; };
> int foo (struct X *p)
> {
> return p->a[1];
> }
> 
> emit
> 
> warning: array subscript 1 is above array bounds ...
> note: the trailing array is only a flexible array member with 
> -fno-strict-flex-arrays
> 
> ?  Having -Wstrict-flex-arrays=N and N not agree with the
> -fstrict-flex-arrays level sounds hardly useful to me but the
> information that we ran into a trailing array but didn't consider
> it a flex array because of -fstrict-flex-arrays is always a
> useful information?
> 
> But maybe I misunderstood this new diagnostic?
> 
> Thanks,
> Richard.
> 
> 
>> thanks.
>> 
>> Qing
>> 
>>> On Dec 6, 2022, at 11:18 AM, Qing Zhao  wrote:
>>> 
>>> '-Wstrict-flex-arrays'
>>>  Warn about improper usages of flexible array members according 

[committed] libstdc++: Fix self-move for std::weak_ptr [PR108118]

2022-12-16 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux. Pushed to trunk. I'll backport this too.

-- >8 --

I think an alternative fix would be something like:

  _M_ptr = std::exchange(rhs._M_ptr, nullptr);
  _M_refcount = std::move(rhs._M_refcount);

The standard's move-and-swap implementation generates smaller code at
all levels except -O0 and -Og, so it seems simplest to just do what the
standard says.

libstdc++-v3/ChangeLog:

PR libstdc++/108118
* include/bits/shared_ptr_base.h (weak_ptr::operator=):
Implement as move-and-swap exactly as specified in the standard.
* testsuite/20_util/weak_ptr/cons/self_move.cc: New test.
---
 libstdc++-v3/include/bits/shared_ptr_base.h   |  4 +---
 .../20_util/weak_ptr/cons/self_move.cc| 19 +++
 2 files changed, 20 insertions(+), 3 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/20_util/weak_ptr/cons/self_move.cc

diff --git a/libstdc++-v3/include/bits/shared_ptr_base.h 
b/libstdc++-v3/include/bits/shared_ptr_base.h
index d9230b72203..c22b397a194 100644
--- a/libstdc++-v3/include/bits/shared_ptr_base.h
+++ b/libstdc++-v3/include/bits/shared_ptr_base.h
@@ -2049,9 +2049,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   __weak_ptr&
   operator=(__weak_ptr&& __r) noexcept
   {
-   _M_ptr = __r._M_ptr;
-   _M_refcount = std::move(__r._M_refcount);
-   __r._M_ptr = nullptr;
+   __weak_ptr(std::move(__r)).swap(*this);
return *this;
   }
 
diff --git a/libstdc++-v3/testsuite/20_util/weak_ptr/cons/self_move.cc 
b/libstdc++-v3/testsuite/20_util/weak_ptr/cons/self_move.cc
new file mode 100644
index 000..c890d2ba94d
--- /dev/null
+++ b/libstdc++-v3/testsuite/20_util/weak_ptr/cons/self_move.cc
@@ -0,0 +1,19 @@
+// { dg-do run { target c++11 } }
+
+#include 
+#include 
+
+void
+test_self_move()
+{
+  std::shared_ptr sp(new int(66));
+  std::weak_ptr wp(sp);
+  wp = std::move(wp); // PR libstdc++/108118
+  std::shared_ptr sp2(wp);
+  VERIFY(sp2 == sp);
+}
+
+int main()
+{
+  test_self_move();
+}
-- 
2.38.1



Add '-Wno-complain-wrong-lang', and use it in 'gcc/testsuite/lib/target-supports.exp:check_compile' and elsewhere (was: Make '-frust-incomplete-and-experimental-compiler-do-not-use' a 'Common' option)

2022-12-16 Thread Thomas Schwinge
Hi!

On 2022-12-15T16:17:05+0100, Jakub Jelinek  wrote:
> On Thu, Dec 15, 2022 at 04:01:33PM +0100, Thomas Schwinge wrote:
>> Or, options are applicable to just one front end, and can just be a no-op
>> for others, for shared-language compilation.  For example, '-nostdinc++',
>> or '-frust-incomplete-and-experimental-compiler-do-not-use' need not
>> necessarily emit a diagnostic, but can just just be ignored by 'cc1',
>> 'f951', 'lto1'.
>
> One simple change could be to add a new warning option and use it for
> complain_wrong_lang warnings:
>   else if (ok_langs[0] != '\0')
> /* Eventually this should become a hard error IMO.  */
> warning (0, "command-line option %qs is valid for %s but not for %s",
>  text, ok_langs, bad_lang);

(By the way, that comment was originally added in 2003-06-07
commit 2772ef3ef33609dd64209323e9418a847685971a
"Move handling of lang-specific switches to toplev".)

>   else
> /* Happens for -Werror=warning_name.  */
> warning (0, "%<-Werror=%> argument %qs is not valid for %s",
>  text, bad_lang);
> We could keep the existing behavior, but give users (and our testsuite)
> a way to silence that warning if they are ok with it applying only to a
> subset of languages.
> Then one could use
> -frust-incomplete-and-experimental-compiler-do-not-use -Wno-whatever
> or add that -Wno-whatever in check_compile if the snippet is different
> language from main language of the testsuite (or always) etc.

Like in the attached
"Add '-Wno-complain-wrong-lang', and use it in 
'gcc/testsuite/lib/target-supports.exp:check_compile' and elsewhere",
for example?

Anything that 'gcc/opts-global.cc:complain_wrong_lang' might do is cut
short by '-Wno-complain-wrong-lang', not just the one 'warning'
diagnostic.  This corresponds to what already exists via
'lang_hooks.complain_wrong_lang_p'.

The 'gcc/opts-common.cc:prune_options' changes follow the same rationale
as PR67640 "driver passes -fdiagnostics-color= always last": we need to
process '-Wno-complain-wrong-lang' early, so that it properly affects
other options appearing before it on the command line.


In the test suites, a number of existing test cases explicitly match the
"command-line option [...] is valid for [...] but not for [...]"
diagnostic with 'dg-warning'; I've left those alone.  On the other hand,
I've changed 'dg-prune-output' of this diagnostic into
'-Wno-complain-wrong-lang' usage.  I'm happy to adjust that in either way
anyone may prefer.  I've not looked for test cases that just to silence
this diagnostic use more general 'dg-prune-output', 'dg-excess-errors',
'-w', etc.

In the GCC/D test suite, I see a number of:

cc1plus: warning: command-line option '-fpreview=in' is valid for D but not 
for C++

cc1plus: warning: command-line option '-fextern-std=c++11' is valid for D 
but not for C++

It's not clear to me how they, despite this, do achieve
'PASS: [...] (test for excess errors)'?  Maybe I haven't found where that
gets pruned/ignored?


In addition to the test suites, I'm also seeing:

build-gcc/build-x86_64-pc-linux-gnu/libcpp/config.log:cc1: warning: command 
line option '-fno-rtti' is valid for C++/ObjC++ but not for C [enabled by 
default]
build-gcc/gcc/config.log:cc1: warning: command-line option '-fno-rtti' is 
valid for C++/D/ObjC++ but not for C
build-gcc/gcc/config.log:cc1: warning: command-line option '-fno-rtti' is 
valid for C++/D/ObjC++ but not for C
build-gcc/libbacktrace/config.log:cc1: warning: command-line option 
'-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/libcc1/config.log:cc1: warning: command-line option '-fno-rtti' 
is valid for C++/D/ObjC++ but not for C
build-gcc/libcpp/config.log:cc1: warning: command-line option '-fno-rtti' 
is valid for C++/D/ObjC++ but not for C
build-gcc/lto-plugin/config.log:cc1: warning: command-line option 
'-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libatomic/config.log:cc1: warning: 
command-line option '-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libbacktrace/config.log:cc1: warning: 
command-line option '-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libffi/config.log:cc1: warning: command-line 
option '-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libgfortran/config.log:cc1: warning: 
command-line option '-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libgo/config.log:cc1: warning: command-line 
option '-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libgomp/config.log:cc1: warning: command-line 
option '-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libitm/config.log:cc1: warning: command-line 
option '-fno-rtti' is valid for C++/D/ObjC++ but not for C
build-gcc/x86_64-pc-linux-gnu/libobjc/config.log:cc1: 

Re: [PATCH, nvptx, 2/2] Reimplement libgomp barriers for nvptx: bar.red instruction support in GCC

2022-12-16 Thread Tom de Vries via Gcc-patches

On 9/21/22 09:45, Chung-Lin Tang wrote:

Hi Tom, following the first patch.

This new barrier implementation I posted in the first patch uses the 
'bar.red' instruction.

> Usually this could've been easily done with a single line of inline

assembly. However I quickly
realized that because the NVPTX GCC port is implemented with all virtual 
general registers,

we don't have a register constraint usable to select "predicate registers".
Since bar.red uses predicate typed values, I can't create it directly 
using inline asm.


So it appears that the most simple way of accessing it is with a target 
builtin.
The attached patch adds bar.red instructions to the nvptx port, and 
__builtin_nvptx_bar_red_* builtins
to use it. The code should support all variations of bar.red (and, or, 
and popc operations).


(This support was used to implement the first libgomp barrier patch, so 
must be approved together)




What I conclude from what you're telling me here is that this is the 
first patch in the series rather than the second.


So, LGTM, please apply it, unless it cannot be applied by itself without 
causing regressions, in which case you need to fix those first.


IWBN if this also included standalone test-cases in 
gcc/testsuite/gcc.target/nvptx, but I suppose we can live without for now.


Thanks,
- Tom


Thanks,
Chung-Lin

2022-09-21  Chung-Lin Tang  

gcc/ChangeLog:

 * config/nvptx/nvptx.cc (nvptx_print_operand): Add 'p'
 case, adjust comments.
 (enum nvptx_builtins): Add NVPTX_BUILTIN_BAR_RED_AND,
 NVPTX_BUILTIN_BAR_RED_OR, and NVPTX_BUILTIN_BAR_RED_POPC.
 (nvptx_expand_bar_red): New function.
 (nvptx_init_builtins):
 Add DEFs of __builtin_nvptx_bar_red_[and/or/popc].
 (nvptx_expand_builtin): Use nvptx_expand_bar_red to expand
 NVPTX_BUILTIN_BAR_RED_[AND/OR/POPC] cases.

 * config/nvptx/nvptx.md (define_c_enum "unspecv"): Add
 UNSPECV_BARRED_AND, UNSPECV_BARRED_OR, and UNSPECV_BARRED_POPC.
 (BARRED): New int iterator.
 (barred_op,barred_mode,barred_ptxtype): New int attrs.
 (nvptx_barred_): New define_insn.


[PATCH, m2] Add missing m2.stage{profile,feedback} to Make-lang.in

2022-12-16 Thread Gaius Mulley via Gcc-patches



Add missing profile and feedback hooks consistent with
all other frontends.

gcc/m2/ChangeLog:

* Make-lang.in (m2.stageprofile): Added.
(m2.stagefeedback) Added.

Bootstrapped and tested using
configure --with-build-config=bootstrap-lto-lean --enable-languages=m2
make profiledbootstrap-lean
ok for trunk?


diff --git a/gcc/m2/Make-lang.in b/gcc/m2/Make-lang.in
index a8bd7fe4d19..be33e012dc9 100644
--- a/gcc/m2/Make-lang.in
+++ b/gcc/m2/Make-lang.in
@@ -442,6 +442,12 @@ m2.stage3: stage3-start
 m2.stage4: stage4-start
-mv m2/*$(objext) stage4/m2
 
+m2.stageprofile: stageprofile-start
+   -mv m2/*$(objext) stageprofile/m2
+
+m2.stagefeedback: stagefeedback-start
+   -mv m2/*$(objext) stagefeedback/m2
+
 quit: force
echo "calling exit"
exit 1


Re: [Patch] gcc-changelog: Add warning for auto-added files

2022-12-16 Thread Martin Liška
On 12/16/22 13:33, Tobias Burnus wrote:
> Thus, the auto-add feature does not seem to be an often used feature - 
> neither on
> purpose nor accidentally. And glancing at the results, I think it was as 
> often used
> on purpose as it was used accidentally.

Hello.

Understood, I do support the newly added warning with a small patch tweak.

> 
> I was wondering whether the commit hook should print this warning/note or not.

I would print it.

> It can be helpful in case something too much got committed, but for the usual
> case that the file was missed, it simply comes too late. (Frankly, I don't
> know whether the hook currently runs with '-v' or not.)

Cheers,
Martindiff --git a/contrib/gcc-changelog/git_commit.py b/contrib/gcc-changelog/git_commit.py
index b9a60312099..e82fbcacd3e 100755
--- a/contrib/gcc-changelog/git_commit.py
+++ b/contrib/gcc-changelog/git_commit.py
@@ -22,6 +22,7 @@ import difflib
 import os
 import re
 import sys
+from collections import defaultdict
 
 default_changelog_locations = {
 'c++tools',
@@ -707,7 +708,7 @@ class GitCommit:
 msg += f' (did you mean "{candidates[0]}"?)'
 details = '\n'.join(difflib.Differ().compare([file], [candidates[0]])).rstrip()
 self.errors.append(Error(msg, file, details))
-auto_add_warnings = {}
+auto_add_warnings = defaultdict(list)
 for file in sorted(changed_files - mentioned_files):
 if not self.in_ignored_location(file):
 if file in self.new_files:
@@ -740,10 +741,7 @@ class GitCommit:
 file = file[len(entry.folder):].lstrip('/')
 entry.lines.append('\t* %s: New file.' % file)
 entry.files.append(file)
-if entry.folder not in auto_add_warnings:
-auto_add_warnings[entry.folder] = [file]
-else:
-auto_add_warnings[entry.folder].append(file)
+auto_add_warnings[entry.folder].append(file)
 else:
 msg = 'new file in the top-level folder not mentioned in a ChangeLog'
 self.errors.append(Error(msg, file))
@@ -763,11 +761,9 @@ class GitCommit:
 self.errors.append(Error(error, pattern))
 for entry, val in auto_add_warnings.items():
 if len(val) == 1:
-self.warnings.append('Auto-added new file \'%s/%s\''
- % (entry, val[0]))
+self.warnings.append(f"Auto-added new file '{entry}/{val[0]}'")
 else:
-self.warnings.append('Auto-added %d new files in \'%s\''
- % (len(val), entry))
+self.warnings.append(f"Auto-added {len(val)} new files in '{entry}'")
 
 def check_for_correct_changelog(self):
 for entry in self.changelog_entries:



[PATCH] Simplify gimple_assign_load

2022-12-16 Thread Richard Biener via Gcc-patches
The following simplifies and outlines gimple_assign_load.  In
particular it is not necessary to get at the base of the possibly
loaded expression but just handle the case of a single handled
component wrapping a non-memory operand.

Bootstrapped and tested on x86_64-unknown-linux-gnu, not yet
sure if I queue this for stage1 or push it next week.

* gimple.h (gimple_assign_load): Outline...
* gimple.cc (gimple_assign_load): ... here.  Avoid
get_base_address and instead just strip the outermost
handled component, treating a remaining handled component
as load.
---
 gcc/gimple.cc | 20 
 gcc/gimple.h  | 17 +
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/gcc/gimple.cc b/gcc/gimple.cc
index dd054e16453..e7bb7dd12f6 100644
--- a/gcc/gimple.cc
+++ b/gcc/gimple.cc
@@ -1797,6 +1797,26 @@ gimple_assign_unary_nop_p (gimple *gs)
   == TYPE_MODE (TREE_TYPE (gimple_assign_rhs1 (gs);
 }
 
+/* Return true if GS is an assignment that loads from its rhs1.  */
+
+bool
+gimple_assign_load_p (const gimple *gs)
+{
+  tree rhs;
+  if (!gimple_assign_single_p (gs))
+return false;
+  rhs = gimple_assign_rhs1 (gs);
+  if (TREE_CODE (rhs) == WITH_SIZE_EXPR)
+return true;
+  if (handled_component_p (rhs))
+rhs = TREE_OPERAND (rhs, 0);
+  return (handled_component_p (rhs)
+ || DECL_P (rhs)
+ || TREE_CODE (rhs) == MEM_REF
+ || TREE_CODE (rhs) == TARGET_MEM_REF);
+}
+
+
 /* Set BB to be the basic block holding G.  */
 
 void
diff --git a/gcc/gimple.h b/gcc/gimple.h
index adbeb063186..b6ece9f409b 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -1629,6 +1629,7 @@ tree gimple_call_nonnull_arg (gcall *);
 bool gimple_assign_copy_p (gimple *);
 bool gimple_assign_ssa_name_copy_p (gimple *);
 bool gimple_assign_unary_nop_p (gimple *);
+bool gimple_assign_load_p (const gimple *);
 void gimple_set_bb (gimple *, basic_block);
 void gimple_assign_set_rhs_from_tree (gimple_stmt_iterator *, tree);
 void gimple_assign_set_rhs_with_ops (gimple_stmt_iterator *, enum tree_code,
@@ -2952,22 +2953,6 @@ gimple_store_p (const gimple *gs)
   return lhs && !is_gimple_reg (lhs);
 }
 
-/* Return true if GS is an assignment that loads from its rhs1.  */
-
-static inline bool
-gimple_assign_load_p (const gimple *gs)
-{
-  tree rhs;
-  if (!gimple_assign_single_p (gs))
-return false;
-  rhs = gimple_assign_rhs1 (gs);
-  if (TREE_CODE (rhs) == WITH_SIZE_EXPR)
-return true;
-  rhs = get_base_address (rhs);
-  return (DECL_P (rhs)
- || TREE_CODE (rhs) == MEM_REF || TREE_CODE (rhs) == TARGET_MEM_REF);
-}
-
 
 /* Return true if S is a type-cast assignment.  */
 
-- 
2.35.3


Re: [PATCH] gcov: Fix -fprofile-update=atomic

2022-12-16 Thread Sebastian Huber




On 16.12.22 13:09, Richard Biener wrote:

On Fri, Dec 16, 2022 at 11:39 AM Sebastian Huber
  wrote:

On 16.12.22 10:47, Richard Biener wrote:

No, if you select -fprofile-update=atomic, then the updates shall be
atomic from my point of view. If a fallback is acceptable, then you can
use -fprofile-update=prefer-atomic. Using the fallback in
-fprofile-update=atomic is a bug which prevents the use of gcov for
multi-threaded applications on the lower end targets which do not have
atomic operations in hardware.

Ah, OK.  So the fallback there is libatomic calls as well then?  Note
not all targets support libatomic, for those the failure mode is likely
a link error (which might be fine, but we eventually might want to
amend documentation to indicate this failure mode).

It seems these library calls caused issues in the past:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77466

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77378

Hmm, those are testsuite-isms in some way but of course
users would run into the same issue, needing explicit
-latomic (where available).  I suppose target specs could
automatically add that for -fprofile-update=atomic but this
would need to be specified at link time as well then.


Why can't we provide libatomic for all targets? Is gthr.h not always 
available?





One option could be to emit calls to a new libgcov function:

__gcov_inc_counter(counter) -> updated value

This function could use a __gthread_mutex_t mutex for updates. This ends
up probably with quite a bad performance.

But that's eventually what libatomic will do as well as fallback.


For libatomic, hosts can implement a protect_start() and protect_end() 
function. On RTEMS, this is implemented like this:


#include 

static inline UWORD
protect_start (void *ptr)
{
  return _Libatomic_Protect_start (ptr);
}

static inline void
protect_end (void *ptr, UWORD isr_level)
{
  _Libatomic_Protect_end (ptr, isr_level);
}

On single-core systems, this is mapped to interrupt disable/enable. This 
is quite efficient (compared to a mutex).




I don't have a good idea here.

Do you have to explicitly link -latomic on RISCV?


For RTEMS, libatomic is always added to the linker command line:

gcc/config/rtems.h:"%{qrtems:--start-group -lrtemsbsp -lrtemscpu 
-latomic -lc -lgcc --end-group}"


For riscv, there seems to be also a linux special case:

gcc/config/riscv/linux.h:  " %{pthread:" LD_AS_NEEDED_OPTION " -latomic 
" LD_NO_AS_NEEDED_OPTION "}"
gcc/config/riscv/linux.h:#define LIB_SPEC GNU_USER_TARGET_LIB_SPEC " 
-latomic "



--
embedded brains GmbH
Herr Sebastian HUBER
Dornierstr. 4
82178 Puchheim
Germany
email: sebastian.hu...@embedded-brains.de
phone: +49-89-18 94 741 - 16
fax:   +49-89-18 94 741 - 08

Registergericht: Amtsgericht München
Registernummer: HRB 157899
Vertretungsberechtigte Geschäftsführer: Peter Rasmussen, Thomas Dörfler
Unsere Datenschutzerklärung finden Sie hier:
https://embedded-brains.de/datenschutzerklaerung/


[Patch] gcc-changelog: Add warning for auto-added files

2022-12-16 Thread Tobias Burnus

This patch adds a warning for automatically added files. For check_email,
it is always printed - for git_check_commit.py only in verbose mode.

The './check_email.py' variant that implicitly uses the 'patches' directory does
not seem to get used that often, given that I had to remove the ', False'.
The same change was already done for the usage './check_email.py '
in commit r12-868-g7b4bae0acb14a1076df3234e905442bbdf9503dd !

Regarding the patch: I keep wondering whether it should be WARN/warnings or
note/inform.

Disclaimer: I have an older flake8/python-flake8 - thus, my pytest might
have missed some test issues. I think it did test sufficiently many,
but I don't know.

OK for mainline?

 * * *

Running './contrib/gcc-changelog/git_check_commit.py -v HEAD~1..HEAD' and
ignoring those commits without WARN, I only see the following commits
(see attached file).

Namely, 30 commits with:
One commit with 3 ChangeLog directories (single file, 5, and 2 new files, 
respectively)
One commit with 2 ChangeLog directories (one file each)
28 commits with files only auto-added to a single ChangeLog file.

Over all ChangeLog files and commits, there were:
22 new single file additions
11 multi-file additions: 4 times two new files, and then: 5, 7, 11, 12, 13, 28, 
36.

Thus, all in all 0.33% of the commits have auto-added files. It looks as if 
several
had just missed that file - and adding a single file or two seems to be fine.

In cases of eccbd7fcee5bbfc47731e8de83c44eee2e3dcc4b it was moving testcases 
into a
new directory; the commit log stated only their original location (and in the 
description
the new directory).

In case of 5fee5ec362f7a243f459e6378fd49dfc89dc9fb5, the commit log was already
quite long and the 'd: Import' patch had "WARN: Auto-added 36 new files in 
'libphobos'".
This seems to be one of the more useful cases.

While the modula import commit 1eee94d351774cdc2efc8ee508b82d065184c6ee 
seemingly
only missed to list the (1+5+2) files, given the long list of '(New file)', it
seems as if they were just missed.

* * *

Thus, the auto-add feature does not seem to be an often used feature - neither 
on
purpose nor accidentally. And glancing at the results, I think it was as often 
used
on purpose as it was used accidentally.

I was wondering whether the commit hook should print this warning/note or not.
It can be helpful in case something too much got committed, but for the usual
case that the file was missed, it simply comes too late. (Frankly, I don't
know whether the hook currently runs with '-v' or not.)

Tobias
-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
gcc-changelog: Add warning for auto-added files

git_email.py prints now a warning for files added automatically.
git_check_commit.py does likewise but only with --verbose.
It prints one line per ChangeLog file, either stating the file
or if more than one the number of files.

contrib/ChangeLog:

	* gcc-changelog/git_check_commit.py (__main__): With -v print a
	warning for the auto-added files.
	* gcc-changelog/git_commit.py (GitCommit.__init__): Add self.warnings.
	(GitCommit.check_mentioned_files): Add warning for auto-added files.
	(GitCommit.print_warnings): New function.
	* gcc-changelog/git_email.py (__main__): Remove bogus argument to
	GitEmail constructor; print auto-added-files warning.
	* gcc-changelog/test_email.py (test_auto_add_file_1,
	test_auto_add_file_2): New tests.
	* gcc-changelog/test_patches.txt: Add two test cases.

diff --git a/contrib/gcc-changelog/git_check_commit.py b/contrib/gcc-changelog/git_check_commit.py
index 2e3e8cbeb77..2b9f2381f20 100755
--- a/contrib/gcc-changelog/git_check_commit.py
+++ b/contrib/gcc-changelog/git_check_commit.py
@@ -42,7 +42,13 @@ for git_commit in parse_git_revisions(args.git_path, args.revisions):
 if git_commit.success:
 if args.print_changelog:
 git_commit.print_output()
+if args.verbose and git_commit.warnings:
+for warning in git_commit.warnings:
+print('WARN: %s' % warning)
 else:
+if args.verbose and git_commit.warnings:
+for warning in git_commit.warnings:
+print('WARN: %s' % warning)
 for error in git_commit.errors:
 print('ERR: %s' % error)
 if args.verbose and error.details:
diff --git a/contrib/gcc-changelog/git_commit.py b/contrib/gcc-changelog/git_commit.py
index 66d68de03a5..b9a60312099 100755
--- a/contrib/gcc-changelog/git_commit.py
+++ b/contrib/gcc-changelog/git_commit.py
@@ -304,6 +304,7 @@ class GitCommit:
 self.changes = None
 self.changelog_entries = []
 self.errors = []
+self.warnings = []
 self.top_level_authors = []
 self.co_authors = []
 self.top_level_prs = []

Re: [PATCH] gcov: Fix -fprofile-update=atomic

2022-12-16 Thread Richard Biener via Gcc-patches
On Fri, Dec 16, 2022 at 11:39 AM Sebastian Huber
 wrote:
>
> On 16.12.22 10:47, Richard Biener wrote:
> >> No, if you select -fprofile-update=atomic, then the updates shall be
> >> atomic from my point of view. If a fallback is acceptable, then you can
> >> use -fprofile-update=prefer-atomic. Using the fallback in
> >> -fprofile-update=atomic is a bug which prevents the use of gcov for
> >> multi-threaded applications on the lower end targets which do not have
> >> atomic operations in hardware.
> > Ah, OK.  So the fallback there is libatomic calls as well then?  Note
> > not all targets support libatomic, for those the failure mode is likely
> > a link error (which might be fine, but we eventually might want to
> > amend documentation to indicate this failure mode).
>
> It seems these library calls caused issues in the past:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77466
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77378

Hmm, those are testsuite-isms in some way but of course
users would run into the same issue, needing explicit
-latomic (where available).  I suppose target specs could
automatically add that for -fprofile-update=atomic but this
would need to be specified at link time as well then.

> One option could be to emit calls to a new libgcov function:
>
> __gcov_inc_counter(counter) -> updated value
>
> This function could use a __gthread_mutex_t mutex for updates. This ends
> up probably with quite a bad performance.

But that's eventually what libatomic will do as well as fallback.

I don't have a good idea here.

Do you have to explicitly link -latomic on RISCV?

Richard.

> --
> embedded brains GmbH
> Herr Sebastian HUBER
> Dornierstr. 4
> 82178 Puchheim
> Germany
> email: sebastian.hu...@embedded-brains.de
> phone: +49-89-18 94 741 - 16
> fax:   +49-89-18 94 741 - 08
>
> Registergericht: Amtsgericht München
> Registernummer: HRB 157899
> Vertretungsberechtigte Geschäftsführer: Peter Rasmussen, Thomas Dörfler
> Unsere Datenschutzerklärung finden Sie hier:
> https://embedded-brains.de/datenschutzerklaerung/


[PATCH] middle-end/108086 - avoid unshare_expr when remapping SSA names

2022-12-16 Thread Richard Biener via Gcc-patches
r0-89280-g129a37fc319db8 added unsharing to remap_ssa_name but
that wasn't in the version of the patch posted.  That has some
non-trivial cost through mostly_copy_tree_r and copy_tree_r but
more importantly it doesn't seem to be necessary.  I've successfully
bootstrapped and tested with an assert we only get
tree_node_can_be_shared trees here.

Bootstrapped and tested on x86_64-unknown-linux-gnu with all
languages.

Pushed to trunk.

PR middle-end/108086
* tree-inline.cc (remap_ssa_name): Do not unshare the
result from the decl_map.
---
 gcc/tree-inline.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index c802792fa07..b471774ce51 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -183,7 +183,7 @@ remap_ssa_name (tree name, copy_body_data *id)
  return name;
}
 
-  return unshare_expr (*n);
+  return *n;
 }
 
   if (processing_debug_stmt)
-- 
2.35.3


Re: [PATCH 1/2] ivopts: Revert computation of address cost complexity.

2022-12-16 Thread Richard Biener via Gcc-patches
On Fri, Dec 16, 2022 at 12:37 PM Dimitrije Milosevic
 wrote:
>
>
> Hi Richard,
>
> > The only documentation on complexity I find is
> >
> >   int64_t cost; /* The runtime cost.  */
> >   unsigned complexity;  /* The estimate of the complexity of the code for
> >the computation (in no concrete units --
> >complexity field should be larger for more
> >complex expressions and addressing modes).  */
> >
> > and complexity is used as tie-breaker only when cost is equal.  Given that
> > shouldn't unsupported addressing modes have higher complexity?  I'll note
> > that there's nothing "unsupported", each "unsupported" address computation
> > is lowered into supported pieces.  "unsupported" maybe means that
> > "cost" isn't fully covered by address-cost and compensation stmts might
> > be costed in quantities not fully compatible with that?
>
> Correct, that's what I was aiming for initially - before f9f69dd that was the 
> case,
> "unsupported" addressing modes had higher complexities.
> Also, that's what I meant by "unsupported" as well, thanks.
>
> > That said, "complexity" seems to only complicate things :/  We do have the
> > tie-breaker on preferring less IVs.  complexity was added in
> > r0-85562-g6e8c65f6621fb0 as part of fixing PR34711.
>
> I agree that the complexity part is just (kind of) out there, not really 
> strongly
> defined. I'm not sure how to feel about merging complexity into the cost part
> of an address cost, though.
>
> > If it's really only about the "complexity" value then each
> > compensation step should
> > add to the complexity?
>
> That could be the way to go. Also worth verifying is that we compensate for
> each case of an unsupported addressing mode.

Yes.  Also given complexity is only a tie-breaker we should cost the
compensation
somehow, but then complexity doesn't look necessary ...

Meh.

>
> Kind regards,
> Dimitrije
>
> From: Richard Biener 
> Sent: Friday, December 16, 2022 10:58 AM
> To: Dimitrije Milosevic 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org 
> ; Djordje Todorovic 
> Subject: Re: [PATCH 1/2] ivopts: Revert computation of address cost 
> complexity.
>
> On Thu, Dec 15, 2022 at 4:26 PM Dimitrije Milosevic
>  wrote:
> >
> > Hi Richard,
> >
> > Sorry for the delayed response, I couldn't find the time to fully focus on 
> > this topic.
> >
> > > I'm not sure this is accurate but at least the cost of using an 
> > > unsupported
> > > addressing mode should be at least that of the compensating code to
> > > mangle it to a supported form.
> >
> > I'm pretty sure IVOPTS does not filter out candidates which aren't 
> > supported by
> > the target architecture. It does, however, adjust the cost for a subset of 
> > those.
> > The adjustment code modifies only the cost part of the address cost (which
> > consists of a cost and a complexity).
> > Having said this, I'd propose two approaches:
> > 1. Cover all cases of unsupported addressing modes (if needed, I'm not 
> > entirely
> > sure they aren't already covered), leaving complexity for 
> > unsupported
> > addressing modes zero.
>
> The only documentation on complexity I find is
>
>   int64_t cost; /* The runtime cost.  */
>   unsigned complexity;  /* The estimate of the complexity of the code for
>the computation (in no concrete units --
>complexity field should be larger for more
>complex expressions and addressing modes).  */
>
> and complexity is used as tie-breaker only when cost is equal.  Given that
> shouldn't unsupported addressing modes have higher complexity?  I'll note
> that there's nothing "unsupported", each "unsupported" address computation
> is lowered into supported pieces.  "unsupported" maybe means that
> "cost" isn't fully covered by address-cost and compensation stmts might
> be costed in quantities not fully compatible with that?
>
> That said, "complexity" seems to only complicate things :/  We do have the
> tie-breaker on preferring less IVs.  complexity was added in
> r0-85562-g6e8c65f6621fb0 as part of fixing PR34711.
>
> > 2. Revert the complexity calculation (which my initial patch does), 
> > leaving
> > everything else as it is.
> > 3. A combination of both - if the control path gets into the adjustment 
> > code, we
> > use the reverted complexity calculation.
>
> If it's really only about the "complexity" value then each
> compensation step should
> add to the complexity?
>
> > I'd love to get feedback regarding this, so I could focus on a concrete 
> > approach.
> >
> > Kind regards,
> > Dimitrije
> >
> > From: Richard Biener 
> > Sent: Monday, November 7, 2022 2:35 PM
> > To: Dimitrije Milosevic 
> > Cc: Jeff Law ; gcc-patches@gcc.gnu.org 
> > ; Djordje Todorovic 
> > Subject: Re: [PATCH 1/2] ivopts: Revert computation of address cost 
> > complexity.
> >
> > On 

[OG12][committed] libgomp: Fix USM bugs

2022-12-16 Thread Andrew Stubbs
I've committed this patch to the devel/omp/gcc-12 branch. It fixes some 
missed cases in the Unified Shared Memory implementation that were 
especially noticeable in Fortran because the size of arrays are known.


This patch will have to be folded into the mainline USM patches that 
were submitted for review. I hope to get to that sometime in the new year.


Andrewlibgomp: Fix USM bugs

Fix up some USM corner cases.

libgomp/ChangeLog:

* libgomp.h (OFFSET_USM): New macro.
* target.c (gomp_map_pointer): Handle USM mappings.
(gomp_map_val): Handle OFFSET_USM.
(gomp_map_vars_internal): Move USM check earlier, and use OFFSET_USM.
Add OFFSET_USM check to the second mapping pass.
* testsuite/libgomp.fortran/usm-1.f90: New test.
* testsuite/libgomp.fortran/usm-2.f90: New test.

diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index e5345ebee53..7d55f3cf825 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1196,6 +1196,7 @@ struct target_mem_desc;
 #define OFFSET_INLINED (~(uintptr_t) 0)
 #define OFFSET_POINTER (~(uintptr_t) 1)
 #define OFFSET_STRUCT (~(uintptr_t) 2)
+#define OFFSET_USM (~(uintptr_t) 3)
 
 /* Auxiliary structure for infrequently-used or API-specific data.  */
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 50709f0677d..19db5a56f4a 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -692,6 +692,9 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct 
goacc_asyncqueue *aq,
 {
   if (allow_zero_length_array_sections)
cur_node.tgt_offset = 0;
+  else if (devicep->is_usm_ptr_func
+  && devicep->is_usm_ptr_func ((void*)cur_node.host_start))
+   cur_node.tgt_offset = cur_node.host_start;
   else
{
  gomp_mutex_unlock (>lock);
@@ -928,6 +931,7 @@ gomp_map_val (struct target_mem_desc *tgt, void 
**hostaddrs, size_t i)
   switch (tgt->list[i].offset)
 {
 case OFFSET_INLINED:
+case OFFSET_USM:
   return (uintptr_t) hostaddrs[i];
 
 case OFFSET_POINTER:
@@ -1013,6 +1017,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 {
   int kind = get_kind (short_mapkind, kinds, i);
   bool implicit = get_implicit (short_mapkind, kinds, i);
+  tgt->list[i].offset = 0;
   if (hostaddrs[i] == NULL
  || (kind & typemask) == GOMP_MAP_FIRSTPRIVATE_INT)
{
@@ -1020,6 +1025,15 @@ gomp_map_vars_internal (struct gomp_device_descr 
*devicep,
  tgt->list[i].offset = OFFSET_INLINED;
  continue;
}
+  else if (devicep->is_usm_ptr_func
+  && devicep->is_usm_ptr_func (hostaddrs[i]))
+   {
+ /* The memory is visible from both host and target
+so nothing needs to be moved.  */
+ tgt->list[i].key = NULL;
+ tgt->list[i].offset = OFFSET_USM;
+ continue;
+   }
   else if ((kind & typemask) == GOMP_MAP_USE_DEVICE_PTR
   || (kind & typemask) == GOMP_MAP_USE_DEVICE_PTR_IF_PRESENT)
{
@@ -1063,15 +1077,6 @@ gomp_map_vars_internal (struct gomp_device_descr 
*devicep,
tgt->list[i].offset = 0;
  continue;
}
-  else if (devicep->is_usm_ptr_func
-  && devicep->is_usm_ptr_func (hostaddrs[i]))
-   {
- /* The memory is visible from both host and target
-so nothing needs to be moved.  */
- tgt->list[i].key = NULL;
- tgt->list[i].offset = OFFSET_INLINED;
- continue;
-   }
   else if ((kind & typemask) == GOMP_MAP_STRUCT)
{
  size_t first = i + 1;
@@ -1441,6 +1446,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
bool implicit = get_implicit (short_mapkind, kinds, i);
if (hostaddrs[i] == NULL)
  continue;
+   if (tgt->list[i].offset == OFFSET_USM)
+ continue;
switch (kind & typemask)
  {
size_t align, len, first, last;
diff --git a/libgomp/testsuite/libgomp.fortran/usm-1.f90 
b/libgomp/testsuite/libgomp.fortran/usm-1.f90
new file mode 100644
index 000..7147e9925a2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/usm-1.f90
@@ -0,0 +1,28 @@
+! { dg-do run }
+! { dg-require-effective-target omp_usm }
+
+! Ensure that USM works for implicit mappings.
+! This needs to cover both the initial mapping scan and the rescan that
+! happens when some of the mappings aren't no-ops (in this cases there are
+! some hidden pointers).
+
+program usm
+  use iso_fortran_env
+  use omp_lib
+  implicit none
+
+  !$omp requires unified_shared_memory
+
+  integer, parameter :: N = 1024
+  real(real64), allocatable :: x(:), y(:)
+  integer :: i
+
+  allocate(x(N), y(N))
+  !$omp target teams distribute parallel do simd
+  do i=1,N
+y(i) = x(i)
+  enddo
+
+  deallocate(x,y)
+
+end program usm
diff --git a/libgomp/testsuite/libgomp.fortran/usm-2.f90 
b/libgomp/testsuite/libgomp.fortran/usm-2.f90
new file mode 100644
index 

Re: [PATCH 1/2] ivopts: Revert computation of address cost complexity.

2022-12-16 Thread Dimitrije Milosevic

Hi Richard,

> The only documentation on complexity I find is
>
>   int64_t cost; /* The runtime cost.  */
>   unsigned complexity;  /* The estimate of the complexity of the code for
>    the computation (in no concrete units --
>    complexity field should be larger for more
>    complex expressions and addressing modes).  */
>
> and complexity is used as tie-breaker only when cost is equal.  Given that
> shouldn't unsupported addressing modes have higher complexity?  I'll note
> that there's nothing "unsupported", each "unsupported" address computation
> is lowered into supported pieces.  "unsupported" maybe means that
> "cost" isn't fully covered by address-cost and compensation stmts might
> be costed in quantities not fully compatible with that?

Correct, that's what I was aiming for initially - before f9f69dd that was the 
case,
"unsupported" addressing modes had higher complexities.
Also, that's what I meant by "unsupported" as well, thanks.

> That said, "complexity" seems to only complicate things :/  We do have the
> tie-breaker on preferring less IVs.  complexity was added in
> r0-85562-g6e8c65f6621fb0 as part of fixing PR34711.

I agree that the complexity part is just (kind of) out there, not really 
strongly
defined. I'm not sure how to feel about merging complexity into the cost part
of an address cost, though.

> If it's really only about the "complexity" value then each
> compensation step should
> add to the complexity?

That could be the way to go. Also worth verifying is that we compensate for
each case of an unsupported addressing mode.

Kind regards,
Dimitrije

From: Richard Biener 
Sent: Friday, December 16, 2022 10:58 AM
To: Dimitrije Milosevic 
Cc: Jeff Law ; gcc-patches@gcc.gnu.org 
; Djordje Todorovic 
Subject: Re: [PATCH 1/2] ivopts: Revert computation of address cost complexity. 
 
On Thu, Dec 15, 2022 at 4:26 PM Dimitrije Milosevic
 wrote:
>
> Hi Richard,
>
> Sorry for the delayed response, I couldn't find the time to fully focus on 
> this topic.
>
> > I'm not sure this is accurate but at least the cost of using an unsupported
> > addressing mode should be at least that of the compensating code to
> > mangle it to a supported form.
>
> I'm pretty sure IVOPTS does not filter out candidates which aren't supported 
> by
> the target architecture. It does, however, adjust the cost for a subset of 
> those.
> The adjustment code modifies only the cost part of the address cost (which
> consists of a cost and a complexity).
> Having said this, I'd propose two approaches:
> 1. Cover all cases of unsupported addressing modes (if needed, I'm not 
>entirely
> sure they aren't already covered), leaving complexity for unsupported
> addressing modes zero.

The only documentation on complexity I find is

  int64_t cost; /* The runtime cost.  */
  unsigned complexity;  /* The estimate of the complexity of the code for
   the computation (in no concrete units --
   complexity field should be larger for more
   complex expressions and addressing modes).  */

and complexity is used as tie-breaker only when cost is equal.  Given that
shouldn't unsupported addressing modes have higher complexity?  I'll note
that there's nothing "unsupported", each "unsupported" address computation
is lowered into supported pieces.  "unsupported" maybe means that
"cost" isn't fully covered by address-cost and compensation stmts might
be costed in quantities not fully compatible with that?

That said, "complexity" seems to only complicate things :/  We do have the
tie-breaker on preferring less IVs.  complexity was added in
r0-85562-g6e8c65f6621fb0 as part of fixing PR34711.

> 2. Revert the complexity calculation (which my initial patch does), 
>leaving
> everything else as it is.
> 3. A combination of both - if the control path gets into the adjustment 
>code, we
> use the reverted complexity calculation.

If it's really only about the "complexity" value then each
compensation step should
add to the complexity?

> I'd love to get feedback regarding this, so I could focus on a concrete 
> approach.
>
> Kind regards,
> Dimitrije
>
> From: Richard Biener 
> Sent: Monday, November 7, 2022 2:35 PM
> To: Dimitrije Milosevic 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org 
> ; Djordje Todorovic 
> Subject: Re: [PATCH 1/2] ivopts: Revert computation of address cost 
> complexity.
>
> On Wed, Nov 2, 2022 at 9:40 AM Dimitrije Milosevic
>  wrote:
> >
> > Hi Jeff,
> >
> > > This is exactly what I was trying to get to.   If the addressing mode
> > > isn't supported, then we shouldn't be picking it as a candidate.  If it
> > > is, then we've probably got a problem somewhere else in this code and
> > > this patch is likely papering over it.
>
> I'm not sure this is accurate but at least the cost of using an unsupported
> addressing 

[PATCH (pushed)] gcc-changelog: do not use PatchSet.from_filename

2022-12-16 Thread Martin Liška
Use the PatchSet constructor instead, where we can pass
a file properly opened with newline='\n'.

contrib/ChangeLog:

* gcc-changelog/git_email.py: Use PatchSet constructor
as newline argument is not supported with older unidiff
library.
---
 contrib/gcc-changelog/git_email.py | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/contrib/gcc-changelog/git_email.py 
b/contrib/gcc-changelog/git_email.py
index 093c887ba4c..f3773f178ea 100755
--- a/contrib/gcc-changelog/git_email.py
+++ b/contrib/gcc-changelog/git_email.py
@@ -39,18 +39,15 @@ unidiff_supports_renaming = hasattr(PatchedFile(), 
'is_rename')
 class GitEmail(GitCommit):
 def __init__(self, filename):
 self.filename = filename
-try:
-  diff = PatchSet.from_filename(filename, newline='\n')
-except TypeError:
-  # Older versions don't have the newline argument
-  diff = PatchSet.from_filename(filename)
 date = None
 author = None
 subject = ''
 
 subject_last = False
-with open(self.filename, 'r') as f:
-lines = f.read().splitlines()
+with open(self.filename, newline='\n') as f:
+data = f.read()
+diff = PatchSet(data)
+lines = data.splitlines()
 lines = list(takewhile(lambda line: line != '---', lines))
 for line in lines:
 if line.startswith(DATE_PREFIX):
-- 
2.39.0



Re: [PATCH] gcov: Fix -fprofile-update=atomic

2022-12-16 Thread Sebastian Huber

On 16.12.22 10:47, Richard Biener wrote:

No, if you select -fprofile-update=atomic, then the updates shall be
atomic from my point of view. If a fallback is acceptable, then you can
use -fprofile-update=prefer-atomic. Using the fallback in
-fprofile-update=atomic is a bug which prevents the use of gcov for
multi-threaded applications on the lower end targets which do not have
atomic operations in hardware.

Ah, OK.  So the fallback there is libatomic calls as well then?  Note
not all targets support libatomic, for those the failure mode is likely
a link error (which might be fine, but we eventually might want to
amend documentation to indicate this failure mode).


It seems these library calls caused issues in the past:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77466

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77378

One option could be to emit calls to a new libgcov function:

__gcov_inc_counter(counter) -> updated value

This function could use a __gthread_mutex_t mutex for updates. This ends 
up probably with quite a bad performance.


--
embedded brains GmbH
Herr Sebastian HUBER
Dornierstr. 4
82178 Puchheim
Germany
email: sebastian.hu...@embedded-brains.de
phone: +49-89-18 94 741 - 16
fax:   +49-89-18 94 741 - 08

Registergericht: Amtsgericht München
Registernummer: HRB 157899
Vertretungsberechtigte Geschäftsführer: Peter Rasmussen, Thomas Dörfler
Unsere Datenschutzerklärung finden Sie hier:
https://embedded-brains.de/datenschutzerklaerung/


[PATCH] hwasan: Add libhwasan_preinit.o

2022-12-16 Thread Jakub Jelinek via Gcc-patches
Hi!

I've noticed an inconsistency with the other sanitizers.
For -fsanitize={address,thread,leak} we link into binaries
lib*san_preinit.o such that the -lasan, -ltsan or -llsan libraries
are initialized as early as possible through .preinit_array.
The hwasan library has the same thing, but we strangely compiled
it into the library (where it apparently didn't do anything,
.preinit_array doesn't seem to be created for shared libraries),
rather than installing it like in the other 3 cases.

The following patch handles it for hwasan similarly to asan, tsan and lsan.

I don't have any hw with hwasan support, so I've just checked it
builds and installs as expected and that
gcc -fsanitize=hwaddress -o a a.c -mlam=u57
on trivial main results in .preinit_array section in the binary.

Ok for trunk?

2022-12-16  Jakub Jelinek  

* config/gnu-user.h (LIBHWASAN_EARLY_SPEC): Add libhwasan_preinit.o
to link spec if not -shared.

* hwasan/Makefile.am (nodist_toolexeclib_HEADERS): Set to
libhwasan_preinit.o.
(hwasan_files): Remove hwasan_preinit.cpp.
(libhwasan_preinit.o): Copy from hwasan_preinit.o.
* hwasan/Makefile.in: Regenerated.

--- gcc/config/gnu-user.h.jj2022-01-11 23:11:21.753299105 +0100
+++ gcc/config/gnu-user.h   2022-12-16 10:41:16.289726621 +0100
@@ -138,7 +138,8 @@ see the files COPYING3 and COPYING.RUNTI
   LD_STATIC_OPTION " --whole-archive -lasan --no-whole-archive " \
   LD_DYNAMIC_OPTION "}}%{!static-libasan:-lasan}"
 #undef LIBHWASAN_EARLY_SPEC
-#define LIBHWASAN_EARLY_SPEC "%{static-libhwasan:%{!shared:" \
+#define LIBHWASAN_EARLY_SPEC "%{!shared:libhwasan_preinit%O%s} " \
+  "%{static-libhwasan:%{!shared:" \
   LD_STATIC_OPTION " --whole-archive -lhwasan --no-whole-archive " \
   LD_DYNAMIC_OPTION "}}%{!static-libhwasan:-lhwasan}"
 #undef LIBTSAN_EARLY_SPEC
--- libsanitizer/hwasan/Makefile.am.jj  2022-05-09 09:09:21.003463752 +0200
+++ libsanitizer/hwasan/Makefile.am 2022-12-16 10:32:25.999508099 +0100
@@ -12,6 +12,7 @@ AM_CCASFLAGS = $(EXTRA_ASFLAGS)
 ACLOCAL_AMFLAGS = -I $(top_srcdir) -I $(top_srcdir)/config
 
 toolexeclib_LTLIBRARIES = libhwasan.la
+nodist_toolexeclib_HEADERS = libhwasan_preinit.o
 
 hwasan_files = \
hwasan_allocation_functions.cpp \
@@ -28,7 +29,6 @@ hwasan_files = \
hwasan_memintrinsics.cpp \
hwasan_new_delete.cpp \
hwasan_poisoning.cpp \
-   hwasan_preinit.cpp \
hwasan_report.cpp \
hwasan_setjmp_aarch64.S \
hwasan_setjmp_x86_64.S \
@@ -49,6 +49,9 @@ libhwasan_la_LIBADD += $(LIBSTDCXX_RAW_C
 
 libhwasan_la_LDFLAGS = -version-info `grep -v '^\#' $(srcdir)/libtool-version` 
$(link_libhwasan)
 
+libhwasan_preinit.o: hwasan_preinit.o
+   cp $< $@
+
 # Work around what appears to be a GNU make bug handling MAKEFLAGS
 # values defined in terms of make variables, as is the case for CC and
 # friends when we are called from the top level Makefile.
--- libsanitizer/hwasan/Makefile.in.jj  2022-05-09 09:09:21.003463752 +0200
+++ libsanitizer/hwasan/Makefile.in 2022-12-16 10:33:20.681705095 +0100
@@ -14,6 +14,7 @@
 
 @SET_MAKE@
 
+
 VPATH = @srcdir@
 am__is_gnu_make = { \
   if test -z '$(MAKELEVEL)'; then \
@@ -141,7 +142,8 @@ am__uninstall_files_from_dir = { \
 || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
  $(am__cd) "$$dir" && rm -f $$files; }; \
   }
-am__installdirs = "$(DESTDIR)$(toolexeclibdir)"
+am__installdirs = "$(DESTDIR)$(toolexeclibdir)" \
+   "$(DESTDIR)$(toolexeclibdir)"
 LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
 am__DEPENDENCIES_1 =
 libhwasan_la_DEPENDENCIES =  \
@@ -152,10 +154,9 @@ am__objects_1 = hwasan_allocation_functi
hwasan_fuchsia.lo hwasan_globals.lo hwasan_interceptors.lo \
hwasan_interceptors_vfork.lo hwasan_linux.lo \
hwasan_memintrinsics.lo hwasan_new_delete.lo \
-   hwasan_poisoning.lo hwasan_preinit.lo hwasan_report.lo \
-   hwasan_setjmp_aarch64.lo hwasan_setjmp_x86_64.lo \
-   hwasan_tag_mismatch_aarch64.lo hwasan_thread.lo \
-   hwasan_thread_list.lo hwasan_type_test.lo
+   hwasan_poisoning.lo hwasan_report.lo hwasan_setjmp_aarch64.lo \
+   hwasan_setjmp_x86_64.lo hwasan_tag_mismatch_aarch64.lo \
+   hwasan_thread.lo hwasan_thread_list.lo hwasan_type_test.lo
 am_libhwasan_la_OBJECTS = $(am__objects_1)
 libhwasan_la_OBJECTS = $(am_libhwasan_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
@@ -233,6 +234,7 @@ am__can_run_installinfo = \
 n|no|NO) false;; \
 *) (install-info --version) >/dev/null 2>&1;; \
   esac
+HEADERS = $(nodist_toolexeclib_HEADERS)
 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
 # Read a list of newline-separated strings from the standard input,
 # and print each of them once, without duplicates.  Input order is
@@ -415,6 +417,7 @@ AM_CXXFLAGS = -Wall -W -Wno-unused-param
 AM_CCASFLAGS = $(EXTRA_ASFLAGS)
 ACLOCAL_AMFLAGS = -I $(top_srcdir) -I $(top_srcdir)/config
 toolexeclib_LTLIBRARIES = libhwasan.la

[PATCH v2] gcov: Fix -fprofile-update=atomic

2022-12-16 Thread Sebastian Huber
The code coverage support uses counters to determine which edges in the control
flow graph were executed.  If a counter overflows, then the code coverage
information is invalid.  Therefore the counter type should be a 64-bit integer.
In multithreaded applications, it is important that the counter increments are
atomic.  This is not the case by default.  The user can enable atomic counter
increments through the -fprofile-update=atomic and
-fprofile-update=prefer-atomic options.

If the hardware supports 64-bit atomic operations, then everything is fine.  If
not and -fprofile-update=prefer-atomic was chosen by the user, then non-atomic
counter increments will be used.  However, if the hardware does not support the
required atomic operations and -fprofile-atomic=update was chosen by the user,
then a warning was issued and as a forced fallback to non-atomic operations was
done.  This is probably not what a user wants.  There is still hardware on the
market which does not have atomic operations and is used for multithreaded
applications.  A user which selects -fprofile-update=atomic wants consistent
code coverage data and not random data.

This patch removes the fallback to non-atomic operations for
-fprofile-update=atomic.  If atomic operations in hardware are not available,
then a library call to libatomic is emitted.  To mitigate potential performance
issues an optimization for systems which only support 32-bit atomic operations
is provided.  Here, the edge counter increments are done like this:

  low = __atomic_add_fetch_4 (, 1, MEMMODEL_RELAXED);
  high_inc = low == 0 ? 1 : 0;
  __atomic_add_fetch_4 (, high_inc, MEMMODEL_RELAXED);

In gimple_gen_time_profiler() this split operation cannot be used, since the
updated counter value is also required.  Here, a library call is emitted.  This
is not a performance issue since the update is only done if counters[0] == 0.

gcc/ChangeLog:

* tree-profile.cc (split_atomic_increment): New.
(gimple_gen_edge_profiler): Split the atomic edge counter increment in
two 32-bit atomic operations if necessary.
(tree_profiling): Remove profile update warning and fallback.  Set
split_atomic_increment if necessary.
* doc/invoke.texi (-fprofile-update): Clarify default method.  Document
the atomic method behaviour.
---
v2:

* Check gcov_type_size for split_atomic_increment.

* Update documentation.

 gcc/doc/invoke.texi | 16 -
 gcc/tree-profile.cc | 81 +
 2 files changed, 74 insertions(+), 23 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index a50417a4ab7..6b32b659e50 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16457,7 +16457,21 @@ while the second one prevents profile corruption by 
emitting thread-safe code.
 Using @samp{prefer-atomic} would be transformed either to @samp{atomic},
 when supported by a target, or to @samp{single} otherwise.  The GCC driver
 automatically selects @samp{prefer-atomic} when @option{-pthread}
-is present in the command line.
+is present in the command line, otherwise the default method is @samp{single}.
+
+If @samp{atomic} is selected, then the profile information is updated using
+atomic operations.  If the target does not support atomic operations in
+hardware, then function calls to @code{__atomic_fetch_add_4} or
+@code{__atomic_fetch_add_8} are emitted.  These functions are usually provided
+by the @file{libatomic} runtime library.  Not all targets provide the
+@file{libatomic} runtime library.  If it is not available for the target, then
+a linker error may happen.  Using function calls to update the profiling
+information may be a performance issue.  For targets which use 64-bit counters
+for the profiling information and support only 32-bit atomic operations, the
+performance critical profiling updates are done using two 32-bit atomic
+operations for each counter update.  If a signal interrupts these two
+operations updating a counter, then the profiling information may be in an
+inconsistent state.
 
 @item -fprofile-filter-files=@var{regex}
 @opindex fprofile-filter-files
diff --git a/gcc/tree-profile.cc b/gcc/tree-profile.cc
index 2beb49241f2..49c8caeae18 100644
--- a/gcc/tree-profile.cc
+++ b/gcc/tree-profile.cc
@@ -73,6 +73,17 @@ static GTY(()) tree ic_tuple_var;
 static GTY(()) tree ic_tuple_counters_field;
 static GTY(()) tree ic_tuple_callee_field;
 
+/* If the user selected atomic profile counter updates
+   (-fprofile-update=atomic), then the counter updates will be done atomically.
+   Ideally, this is done through atomic operations in hardware.  If the
+   hardware supports only 32-bit atomic increments and gcov_type_node is a
+   64-bit integer type, then for the profile edge counters the increment is
+   performed through two separate 32-bit atomic increments.  This case is
+   indicated by the split_atomic_increment variable begin true.  If the
+   hardware does not support atomic 

Re: [PATCH 1/2] ivopts: Revert computation of address cost complexity.

2022-12-16 Thread Richard Biener via Gcc-patches
On Thu, Dec 15, 2022 at 4:26 PM Dimitrije Milosevic
 wrote:
>
> Hi Richard,
>
> Sorry for the delayed response, I couldn't find the time to fully focus on 
> this topic.
>
> > I'm not sure this is accurate but at least the cost of using an unsupported
> > addressing mode should be at least that of the compensating code to
> > mangle it to a supported form.
>
> I'm pretty sure IVOPTS does not filter out candidates which aren't supported 
> by
> the target architecture. It does, however, adjust the cost for a subset of 
> those.
> The adjustment code modifies only the cost part of the address cost (which
> consists of a cost and a complexity).
> Having said this, I'd propose two approaches:
> 1. Cover all cases of unsupported addressing modes (if needed, I'm not 
> entirely
> sure they aren't already covered), leaving complexity for unsupported
> addressing modes zero.

The only documentation on complexity I find is

  int64_t cost; /* The runtime cost.  */
  unsigned complexity;  /* The estimate of the complexity of the code for
   the computation (in no concrete units --
   complexity field should be larger for more
   complex expressions and addressing modes).  */

and complexity is used as tie-breaker only when cost is equal.  Given that
shouldn't unsupported addressing modes have higher complexity?  I'll note
that there's nothing "unsupported", each "unsupported" address computation
is lowered into supported pieces.  "unsupported" maybe means that
"cost" isn't fully covered by address-cost and compensation stmts might
be costed in quantities not fully compatible with that?

That said, "complexity" seems to only complicate things :/  We do have the
tie-breaker on prefering less IVs.  complexity was added in
r0-85562-g6e8c65f6621fb0 as part of fixing PR34711.

> 2. Revert the complexity calculation (which my initial patch does), 
> leaving
> everything else as it is.
> 3. A combination of both - if the control path gets into the adjustment 
> code, we
> use the reverted complexity calculation.

If it's really only about the "complexity" value then each
compensation step should
add to the complexity?

> I'd love to get feedback regarding this, so I could focus on a concrete 
> approach.
>
> Kind regards,
> Dimitrije
>
> From: Richard Biener 
> Sent: Monday, November 7, 2022 2:35 PM
> To: Dimitrije Milosevic 
> Cc: Jeff Law ; gcc-patches@gcc.gnu.org 
> ; Djordje Todorovic 
> Subject: Re: [PATCH 1/2] ivopts: Revert computation of address cost 
> complexity.
>
> On Wed, Nov 2, 2022 at 9:40 AM Dimitrije Milosevic
>  wrote:
> >
> > Hi Jeff,
> >
> > > This is exactly what I was trying to get to.   If the addressing mode
> > > isn't supported, then we shouldn't be picking it as a candidate.  If it
> > > is, then we've probably got a problem somewhere else in this code and
> > > this patch is likely papering over it.
>
> I'm not sure this is accurate but at least the cost of using an unsupported
> addressing mode should be at least that of the compensating code to
> mangle it to a supported form.
>
> > I'll take a deeper look into the candidate selection algorithm then. Will
> > get back to you.
>
> Thanks - as said the unfortunate situation is that both the original author 
> and
> the one who did the last bigger reworks of the code are gone.
>
> Richard.
>
> > Regards,
> > Dimitrije
> >
> > 
> > From: Jeff Law 
> > Sent: Tuesday, November 1, 2022 7:46 PM
> > To: Richard Biener; Dimitrije Milosevic
> > Cc: gcc-patches@gcc.gnu.org; Djordje Todorovic
> > Subject: Re: [PATCH 1/2] ivopts: Revert computation of address cost 
> > complexity.
> >
> >
> > On 10/28/22 01:00, Richard Biener wrote:
> > > On Fri, Oct 28, 2022 at 8:43 AM Dimitrije Milosevic
> > >  wrote:
> > >> Hi Jeff,
> > >>
> > >>> THe part I don't understand is, if you only have BASE+OFF, why does
> > >>> preventing the calculation of more complex addressing modes matter?  ie,
> > >>> what's the point of computing the cost of something like base + off +
> > >>> scaled index when the target can't utilize it?
> > >> Well, the complexities of all addressing modes other than BASE + OFFSET 
> > >> are
> > >> equal to 0. For targets like Mips, which only has BASE + OFFSET, it 
> > >> would still
> > >> be more complex to use a candidate with BASE + INDEX << SCALE + OFFSET
> > >> than a candidate with BASE + INDEX, for example, as it has to compensate
> > >> the lack of other addressing modes somehow. If complexities for both of
> > >> those are equal to 0, in cases where complexities decide which candidate 
> > >> is
> > >> to be chosen, a more complex candidate may be picked.
> > > But something is wrong then - it shouldn't ever pick a candidate with
> > > an addressing
> > > mode that isn't supported?  So you say that the cost of expressing
> > > 'BASE + INDEX << SCALE + OFFSET' as 'BASE + OFFSET' 

Re: [PATCH V2 1/2] x86: Don't add crtfastmath.o for -shared

2022-12-16 Thread Uros Bizjak via Gcc-patches
On Thu, Dec 15, 2022 at 7:23 AM liuhongt  wrote:
>
> Update in V2:
> Split -shared change into a separate commit and add some documentation
> for it.
> Bootstrapped and regtested on x86_64-pc-linu-gnu{-m32,}.
> Ok of trunk?
>
> Don't add crtfastmath.o for -shared to avoid changing the MXCSR register
> when loading a shared library.  crtfastmath.o will be used only when
> building executables.
>
>  PR target/55522
> * config/i386/gnu-user-common.h (GNU_USER_TARGET_MATHFILE_SPEC):
> Don't add crtfastmath.o for -shared.
> * doc/invoke.texi (-shared): Add related documentation.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/gnu-user-common.h | 2 +-
>  gcc/doc/invoke.texi   | 3 ++-
>  2 files changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/gnu-user-common.h 
> b/gcc/config/i386/gnu-user-common.h
> index cab9be2bfb7..9910cd64363 100644
> --- a/gcc/config/i386/gnu-user-common.h
> +++ b/gcc/config/i386/gnu-user-common.h
> @@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
>
>  /* Similar to standard GNU userspace, but adding -ffast-math support.  */
>  #define GNU_USER_TARGET_MATHFILE_SPEC \
> -  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
> +  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} 
> \
> %{mpc32:crtprec32.o%s} \
> %{mpc64:crtprec64.o%s} \
> %{mpc80:crtprec80.o%s}"
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index cb40b38b73a..cba4f19f4f4 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -17656,7 +17656,8 @@ needs to build supplementary stub code for 
> constructors to work.  On
>  multi-libbed systems, @samp{gcc -shared} must select the correct support
>  libraries to link against.  Failing to supply the correct flags may lead
>  to subtle defects.  Supplying them in cases where they are not necessary
> -is innocuous.}
> +is innocuous. For x86, crtfastmath.o will not be added when
> +@option{-shared} is specified. }
>
>  @item -shared-libgcc
>  @itemx -static-libgcc
> --
> 2.27.0
>


Re: [PATCH] gcov: Fix -fprofile-update=atomic

2022-12-16 Thread Richard Biener via Gcc-patches
On Thu, Dec 15, 2022 at 9:34 AM Sebastian Huber
 wrote:
>
> On 13/12/2022 15:30, Richard Biener wrote:
> > On Fri, Dec 9, 2022 at 2:56 PM Sebastian Huber
> >   wrote:
> >> The code coverage support uses counters to determine which edges in the 
> >> control
> >> flow graph were executed.  If a counter overflows, then the code coverage
> >> information is invalid.  Therefore the counter type should be a 64-bit 
> >> integer.
> >> In multithreaded applications, it is important that the counter increments 
> >> are
> >> atomic.  This is not the case by default.  The user can enable atomic 
> >> counter
> >> increments through the -fprofile-update=atomic and
> >> -fprofile-update=prefer-atomic options.
> >>
> >> If the hardware supports 64-bit atomic operations, then everything is 
> >> fine.  If
> >> not and -fprofile-update=prefer-atomic was chosen by the user, then 
> >> non-atomic
> >> counter increments will be used.  However, if the hardware does not 
> >> support the
> >> required atomic operations and -fprofile-atomic=update was chosen by the 
> >> user,
> >> then a warning was issued and as a forced fall-back to non-atomic 
> >> operations
> >> was done.  This is probably not what a user wants.  There is still 
> >> hardware on
> >> the market which does not have atomic operations and is used for 
> >> multithreaded
> >> applications.  A user which selects -fprofile-update=atomic wants 
> >> consistent
> >> code coverage data and not random data.
> >>
> >> This patch removes the fall-back to non-atomic operations for
> >> -fprofile-update=atomic.  If atomic operations in hardware are not 
> >> available,
> >> then a library call to libatomic is emitted.  To mitigate potential 
> >> performance
> >> issues an optimization for systems which only support 32-bit atomic 
> >> operations
> >> is provided.  Here, the edge counter increments are done like this:
> >>
> >>low = __atomic_add_fetch_4 (, 1, MEMMODEL_RELAXED);
> >>high_inc = low == 0 ? 1 : 0;
> >>__atomic_add_fetch_4 (, high_inc, MEMMODEL_RELAXED);
> > You check for compare_and_swapsi and the old code checks
> > TYPE_PRECISION (gcov_type_node) > 32 to determine whether 8 byte or 4 byte
> > gcov_type is used.  But you do not seem to handle the case where
> > neither SImode nor DImode atomic operations are available?  Can we instead
> > do
> >
> >if (gcov_type_size == 4)
> >  can_support_atomic4
> >= HAVE_sync_compare_and_swapsi || HAVE_atomic_compare_and_swapsi;
> >else if (gcov_type_size == 8)
> >  can_support_atomic8
> >= HAVE_sync_compare_and_swapdi || HAVE_atomic_compare_and_swapdi;
> >
> >if (flag_profile_update == PROFILE_UPDATE_ATOMIC
> >&& !can_support_atomic4 && !can_support_atomic8)
> >  {
> >warning (0, "target does not support atomic profile update, "
> > "single mode is selected");
> >flag_profile_update = PROFILE_UPDATE_SINGLE;
> >  }
> >
> > thus retain the diagnostic & fallback for this case?
>
> No, if you select -fprofile-update=atomic, then the updates shall be
> atomic from my point of view. If a fallback is acceptable, then you can
> use -fprofile-update=prefer-atomic. Using the fallback in
> -fprofile-update=atomic is a bug which prevents the use of gcov for
> multi-threaded applications on the lower end targets which do not have
> atomic operations in hardware.

Ah, OK.  So the fallback there is libatomic calls as well then?  Note
not all targets support libatomic, for those the failure mode is likely
a link error (which might be fine, but we eventually might want to
amend documentation to indicate this failure mode).

> > The existing
> > code also suggests
> > there might be targets with HImode or TImode counters?
>
> Sorry, I don't know.

can you conditionalize the split_atomic_increment init on
gcov_type_size == 8 please?

The patch is missing adjusments to the -fprofile-update documentation.
I think the prefer-atomic vs. atomic needs more explanation with this
change.

otherwise the patch looks OK to me.

Thanks,
Richard.

> >
> > In another mail you mentioned that for gen_time_profiler this doesn't
> > work but its
> > code relies on flag_profile_update as well.  So do we need to split the flag
> > somehow, or continue using the PROFILE_UPDATE_SINGLE fallback when
> > we are doing more than just edge profiling?
>
> If atomic operations are not available in hardware, then with this patch
> calls to libatomic are emitted. In gen_time_profiler() this is not an
> issue at all, since the atomic increment is only done if counters[0] ==
> 0 (if I read the code correctly).
>
> For example for
>
> void f(void) {}
>
> we get on riscv:
>
> gcc --coverage -O2 -S -o - test.c -fprofile-update=atomic
>
>  lui a4,%hi(__gcov0.f)
>  li  a3,1
>  addia4,a4,%lo(__gcov0.f)
>  amoadd.w a5,a3,0(a4)
>  lui a4,%hi(__gcov0.f+4)
>  addia5,a5,1
>  seqza5,a5
>  addi

[PATCH] middle-end/108086 - remove PR28238 fix superseeded by PR34018 fix

2022-12-16 Thread Richard Biener via Gcc-patches
There's quite special code in copy_bb that handles inline substitution
of a non-invariant address in place of an invariant one that's
now handled by more generic handling of this case in remap_gimple_op_r
so this removes the special casing that happens in a hot path, providing
a small speedup.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR middle-end/108086
* tree-inline.cc (copy_bb): Remove handling of (foo *)>m
substitution which is done in remap_gimple_op_r via
re-gimplifying.
---
 gcc/tree-inline.cc | 15 ---
 1 file changed, 15 deletions(-)

diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index ad8275185ac..c802792fa07 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -2074,21 +2074,6 @@ copy_bb (copy_body_data *id, basic_block bb,
  gimple_duplicate_stmt_histograms (cfun, stmt, id->src_cfun,
orig_stmt);
 
- /* With return slot optimization we can end up with
-non-gimple (foo *)>m, fix that here.  */
- if (is_gimple_assign (stmt)
- && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
- && !is_gimple_val (gimple_assign_rhs1 (stmt)))
-   {
- tree new_rhs;
- new_rhs = force_gimple_operand_gsi (_gsi,
- gimple_assign_rhs1 (stmt),
- true, NULL, false,
- GSI_CONTINUE_LINKING);
- gimple_assign_set_rhs1 (stmt, new_rhs);
- id->regimplify = false;
-   }
-
  gsi_insert_after (_gsi, stmt, GSI_NEW_STMT);
 
  if (id->regimplify)
-- 
2.35.3


Re: [Patch] gcc-changelog/git_email.py: Support older unidiff.PatchSet

2022-12-16 Thread Martin Liška
On 12/16/22 10:18, Tobias Burnus wrote:
> Another backward compatibility issue - failed here on Ubuntu 20.04 which
> is old but not ancient.
> 
> OK for mainline?

It's fine, thanks for the workaround!

Martin

> 
> Tobias
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955



[Patch] gcc-changelog/git_email.py: Support older unidiff.PatchSet

2022-12-16 Thread Tobias Burnus

Another backward compatibility issue - failed here on Ubuntu 20.04 which
is old but not ancient.

OK for mainline?

Tobias
-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
gcc-changelog/git_email.py: Support older unidiff.PatchSet

Commit "unidiff: use newline='\n' argument",
r13-4603-gb045179973161115c7ea029b2788f5156fc55cda, added support CR
on a line, but that broke support for older unidiff.PatchSet.

This patch uses a fallback for git_email.py (drop argument) if not
available (TypeError exception) but keeps using it in test_email.py
unconditionally.

contrib/ChangeLog:

	* gcc-changelog/git_email.py (GitEmail:__init__): Support older
	unidiff.PatchSet that do not have a newline= argument
	of from_filename.

diff --git a/contrib/gcc-changelog/git_email.py b/contrib/gcc-changelog/git_email.py
index ef50ebfb7fd..093c887ba4c 100755
--- a/contrib/gcc-changelog/git_email.py
+++ b/contrib/gcc-changelog/git_email.py
@@ -39,7 +39,11 @@ unidiff_supports_renaming = hasattr(PatchedFile(), 'is_rename')
 class GitEmail(GitCommit):
 def __init__(self, filename):
 self.filename = filename
-diff = PatchSet.from_filename(filename, newline='\n')
+try:
+  diff = PatchSet.from_filename(filename, newline='\n')
+except TypeError:
+  # Older versions don't have the newline argument
+  diff = PatchSet.from_filename(filename)
 date = None
 author = None
 subject = ''


Re: Ping---[V3][PATCH 2/2] Add a new warning option -Wstrict-flex-arrays.

2022-12-16 Thread Richard Biener via Gcc-patches
On Thu, 15 Dec 2022, Qing Zhao wrote:

> 
> 
> > On Dec 15, 2022, at 2:47 AM, Richard Biener  wrote:
> > 
> > On Wed, 14 Dec 2022, Qing Zhao wrote:
> > 
> >> Hi, Richard,
> >> 
> >> I guess that we now agreed on the following:
> >> 
> >> “ the information that we ran into a trailing array but didn't consider 
> >> it a flex array because of -fstrict-flex-arrays is always a useful 
> >> information”
> >> 
> >> The only thing we didn’t decide is:
> >> 
> >> A. Amend such new information to -Warray-bounds when 
> >> -fstrict-flex-arrays=N (N>0) specified.
> >> 
> >> OR
> >> 
> >> B. Issue such new information with a new warning option 
> >> -Wstrict-flex-arrays when -fstrict-flex-arrays=N (N>0) specified.
> >> 
> >> My current patch implemented B. 
> > 
> > Plus it implements it to specify a different flex-array variant for
> > the extra diagnostic.
> Could you clarify a little bit on this? (Don’t quite understand…)
> > 
> >> If you think A is better, I will change the patch as A. 
> > 
> > I would tend to A since, as I said, it's useful information that
> > shouldn't be hidden and not adding an option removes odd combination
> > possibilities such as -Wno-array-bounds -Wstrict-flex-arrays.
> 
> With current implementation, the above combination will ONLY report the 
> misuse of trailing array as flex-array.  No out-of-bounds warnings 
> issued.
> 
> > In particular I find, say, -fstrict-flex-arrays=2 -Wstrict-flex-arrays=1
> > hardly useful.
> 
> The above combination will NOT happen, because there is NO level argument for 
> -Wstrict-flex-arrays.
> 
> The only combination will be:-fstrict-flex-arrays=N -Wstrict-flex-arrays
> 
> When N > 0, -Wstrict-flex-arrays will report any misuse of trailing arrays as 
> flexible array per the value of N.
> > 
> > But I'm interested in other opinions.
> 
> Adding a separate -Wstrict-flex-arrays will provide users a choice to ONLY 
> look at the mis-use of trailing arrays as flex-arrays.  Without this new 
> option, such information will be buried into tons of out-of-bounds messges. 
> 
> I think this is the major benefit to have one separate new warning 
> -Wstrict-flex-arrays. 
> 
> Do we need to provide the users this choice?

Ah, OK - I can see the value of auditing code this way before
enabling -fstrict-flex-arrays.

> +  if (opts->x_warn_strict_flex_arrays)
> +if (opts->x_flag_strict_flex_arrays == 0)
> +  {
> + opts->x_warn_strict_flex_arrays = 0;
> + warning_at (UNKNOWN_LOCATION, 0,
> + "%<-Wstrict-flex-arrays%> is ignored when"
> + " %<-fstrict-flex-arrays%> does not present");

"is not present".

The patch is OK with that change.

Thanks and sorry for the slow process ...

Richard.

> Thanks.
> 
> Qing
> > 
> > Thanks,
> > Richard.
> > 
> >> Let me know your opinion.
> >> 
> >> thanks.
> >> 
> >> Qing
> >> 
> >> 
> >>> On Dec 14, 2022, at 4:03 AM, Richard Biener  wrote:
> >>> 
> >>> On Tue, 13 Dec 2022, Qing Zhao wrote:
> >>> 
>  Richard, 
>  
>  Do you have any decision on this one? 
>  Do we need this warning option For GCC? 
> >>> 
> >>> Looking at the testcases it seems that the diagnostic amends
> >>> -Warray-bounds diagnostics for trailing but not flexible arrays?
> >>> Wouldn't it be better to generally diagnose this, so have
> >>> -Warray-bounds, with -fstrict-flex-arrays, for
> >>> 
> >>> struct X { int a[1]; };
> >>> int foo (struct X *p)
> >>> {
> >>> return p->a[1];
> >>> }
> >>> 
> >>> emit
> >>> 
> >>> warning: array subscript 1 is above array bounds ...
> >>> note: the trailing array is only a flexible array member with 
> >>> -fno-strict-flex-arrays
> >>> 
> >>> ?  Having -Wstrict-flex-arrays=N and N not agree with the
> >>> -fstrict-flex-arrays level sounds hardly useful to me but the
> >>> information that we ran into a trailing array but didn't consider
> >>> it a flex array because of -fstrict-flex-arrays is always a
> >>> useful information?
> >>> 
> >>> But maybe I misunderstood this new diagnostic?
> >>> 
> >>> Thanks,
> >>> Richard.
> >>> 
> >>> 
>  thanks.
>  
>  Qing
>  
> > On Dec 6, 2022, at 11:18 AM, Qing Zhao  wrote:
> > 
> > '-Wstrict-flex-arrays'
> >   Warn about inproper usages of flexible array members according to
> >   the LEVEL of the 'strict_flex_array (LEVEL)' attribute attached to
> >   the trailing array field of a structure if it's available,
> >   otherwise according to the LEVEL of the option
> >   '-fstrict-flex-arrays=LEVEL'.
> > 
> >   This option is effective only when LEVEL is bigger than 0.
> >   Otherwise, it will be ignored with a warning.
> > 
> >   when LEVEL=1, warnings will be issued for a trailing array
> >   reference of a structure that have 2 or more elements if the
> >   trailing array is referenced as a flexible array member.
> > 
> >   when LEVEL=2, in addition to LEVEL=1, additional warnings will be
> >   issued for a trailing one-element array 

Re: [PATCH] loop-invariant: Split preheader edge if the preheader bb ends with jump [PR106751]

2022-12-16 Thread Richard Biener via Gcc-patches
On Fri, 16 Dec 2022, Jakub Jelinek wrote:

> Hi!
> 
> The RTL loop passes only request simple preheaders, but don't require
> fallthru preheaders, while move_invariant_reg apparently assumes the
> latter, that it can just append instruction(s) to the end of the preheader
> basic block.
> 
> The following patch fixes that by splitting the preheader edge if
> the preheader bb ends with a JUMP_INSN (asm goto in this case).
> Without that we get control flow in the middle of a bb.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

> 2022-12-16  Jakub Jelinek  
> 
>   PR rtl-optimization/106751
>   * loop-invariant.cc (move_invariant_reg): If preheader bb ends
>   with a JUMP_INSN, split the preheader edge and emit invariants
>   into the new preheader basic block.
> 
>   * gcc.c-torture/compile/pr106751.c: New test.
> 
> --- gcc/loop-invariant.cc.jj  2022-01-18 11:58:59.683980614 +0100
> +++ gcc/loop-invariant.cc 2022-12-15 17:14:32.906883258 +0100
> @@ -1837,6 +1837,8 @@ move_invariant_reg (class loop *loop, un
>else if (dump_file)
>   fprintf (dump_file, "Invariant %d moved without introducing a new "
>   "temporary register\n", invno);
> +  if (JUMP_P (BB_END (preheader)))
> + preheader = split_edge (loop_preheader_edge (loop));
>reorder_insns (inv->insn, inv->insn, BB_END (preheader));
>df_recompute_luids (preheader);
>  
> --- gcc/testsuite/gcc.c-torture/compile/pr106751.c.jj 2022-12-15 
> 17:44:35.602519711 +0100
> +++ gcc/testsuite/gcc.c-torture/compile/pr106751.c2022-12-15 
> 17:44:21.789721827 +0100
> @@ -0,0 +1,17 @@
> +/* PR rtl-optimization/106751 */
> +
> +int *foo (void);
> +
> +void
> +bar (void)
> +{
> +  asm goto ("" : : : : lab);
> +  __builtin_unreachable ();
> +lab:
> +  while (1)
> +{
> +  int o;
> +  asm ("" : "=r" (o) : "g" (1));
> +  *foo () = o;
> +}
> +}
> 
>   Jakub
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


[PATCH] loop-invariant: Split preheader edge if the preheader bb ends with jump [PR106751]

2022-12-16 Thread Jakub Jelinek via Gcc-patches
Hi!

The RTL loop passes only request simple preheaders, but don't require
fallthru preheaders, while move_invariant_reg apparently assumes the
latter, that it can just append instruction(s) to the end of the preheader
basic block.

The following patch fixes that by splitting the preheader edge if
the preheader bb ends with a JUMP_INSN (asm goto in this case).
Without that we get control flow in the middle of a bb.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-12-16  Jakub Jelinek  

PR rtl-optimization/106751
* loop-invariant.cc (move_invariant_reg): If preheader bb ends
with a JUMP_INSN, split the preheader edge and emit invariants
into the new preheader basic block.

* gcc.c-torture/compile/pr106751.c: New test.

--- gcc/loop-invariant.cc.jj2022-01-18 11:58:59.683980614 +0100
+++ gcc/loop-invariant.cc   2022-12-15 17:14:32.906883258 +0100
@@ -1837,6 +1837,8 @@ move_invariant_reg (class loop *loop, un
   else if (dump_file)
fprintf (dump_file, "Invariant %d moved without introducing a new "
"temporary register\n", invno);
+  if (JUMP_P (BB_END (preheader)))
+   preheader = split_edge (loop_preheader_edge (loop));
   reorder_insns (inv->insn, inv->insn, BB_END (preheader));
   df_recompute_luids (preheader);
 
--- gcc/testsuite/gcc.c-torture/compile/pr106751.c.jj   2022-12-15 
17:44:35.602519711 +0100
+++ gcc/testsuite/gcc.c-torture/compile/pr106751.c  2022-12-15 
17:44:21.789721827 +0100
@@ -0,0 +1,17 @@
+/* PR rtl-optimization/106751 */
+
+int *foo (void);
+
+void
+bar (void)
+{
+  asm goto ("" : : : : lab);
+  __builtin_unreachable ();
+lab:
+  while (1)
+{
+  int o;
+  asm ("" : "=r" (o) : "g" (1));
+  *foo () = o;
+}
+}

Jakub



[PATCH] middle-end/108086 - more operand scanner reduction in inlining

2022-12-16 Thread Richard Biener via Gcc-patches
There's another round of redundant operand scanning in
remap_gimple_stmt.  The following removes this and also improves
one special-case to call a cheaper inline function.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR middle-end/108086
* tree-inline.cc (remap_gimple_stmt): Add stmts to the
sequence without updating them.  Simplify x == x detection.
---
 gcc/tree-inline.cc | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index 79897f41e86..ad8275185ac 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -1535,7 +1535,7 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
   if (id->reset_location)
gimple_set_location (bind, input_location);
   id->debug_stmts.safe_push (bind);
-  gimple_seq_add_stmt (, bind);
+  gimple_seq_add_stmt_without_update (, bind);
   return stmts;
 }
 
@@ -1765,7 +1765,7 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
 }
   else
 {
-  if (gimple_assign_copy_p (stmt)
+  if (gimple_assign_single_p (stmt)
  && gimple_assign_lhs (stmt) == gimple_assign_rhs1 (stmt)
  && auto_var_in_fn_p (gimple_assign_lhs (stmt), id->src_fn))
{
@@ -1833,7 +1833,7 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
  if (id->reset_location)
gimple_set_location (copy, input_location);
  id->debug_stmts.safe_push (copy);
- gimple_seq_add_stmt (, copy);
+ gimple_seq_add_stmt_without_update (, copy);
  return stmts;
}
   if (gimple_debug_source_bind_p (stmt))
@@ -1845,7 +1845,7 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
  if (id->reset_location)
gimple_set_location (copy, input_location);
  id->debug_stmts.safe_push (copy);
- gimple_seq_add_stmt (, copy);
+ gimple_seq_add_stmt_without_update (, copy);
  return stmts;
}
   if (gimple_debug_nonbind_marker_p (stmt))
@@ -1859,7 +1859,7 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
 
  gdebug *copy = as_a  (gimple_copy (stmt));
  id->debug_stmts.safe_push (copy);
- gimple_seq_add_stmt (, copy);
+ gimple_seq_add_stmt_without_update (, copy);
  return stmts;
}
 
@@ -1967,7 +1967,7 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
   !gsi_end_p (egsi);
   gsi_next ())
walk_gimple_op (gsi_stmt (egsi), remap_gimple_op_r, );
- gimple_seq_add_seq (, extra_stmts);
+ gimple_seq_add_seq_without_update (, extra_stmts);
}
 }
 
@@ -2006,14 +2006,14 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
 gimple_cond_code (cond),
 gimple_cond_lhs (cond),
 gimple_cond_rhs (cond));
-   gimple_seq_add_stmt (, cmp);
+   gimple_seq_add_stmt_without_update (, cmp);
gimple_cond_set_code (cond, NE_EXPR);
gimple_cond_set_lhs (cond, gimple_assign_lhs (cmp));
gimple_cond_set_rhs (cond, boolean_false_node);
  }
 }
 
-  gimple_seq_add_stmt (, copy);
+  gimple_seq_add_stmt_without_update (, copy);
   return stmts;
 }
 
-- 
2.35.3