[PATCH][_Hashtable] Fix merge

2023-10-18 Thread François Dumont

libstdc++: [_Hashtable] Do not reuse untrusted cached hash code

On merge, reuse the merged node's cached hash code only if both containers
use the same hash functor type and that functor is stateless. Using function
pointers or std::function as the hash functor prevents this optimization.
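As a minimal illustration (not part of the patch) of when the cached hash
code can and cannot be trusted:

  // Hedged sketch: same stateless hasher => cached codes are reusable;
  // a stateful hasher such as std::function => codes must be recomputed.
  #include <functional>
  #include <unordered_map>

  int main()
  {
    std::unordered_map<int, int> a{{1, 10}}, b{{2, 20}};
    a.merge(b); // same stateless std::hash<int>: cached codes reused

    using fn_hash = std::function<std::size_t(int)>;
    std::unordered_map<int, int, fn_hash> c(8, fn_hash(std::hash<int>()));
    std::unordered_map<int, int, fn_hash> d(8, fn_hash(std::hash<int>()));
    c.emplace(3, 30);
    d.emplace(4, 40);
    c.merge(d); // stateful hasher: hash codes recomputed on merge
  }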

libstdc++-v3/ChangeLog:

    * include/bits/hashtable_policy.h
    (_Hash_code_base::_M_hash_code(const _Hash&, const
    _Hash_node_value<>&)): Remove.
    (_Hash_code_base::_M_hash_code<_H2>(const _H2&, const
    _Hash_node_value<>&)): Remove.
    * include/bits/hashtable.h
    (_M_src_hash_code<_H2>(const _H2&, const key_type&, const
    __node_value_type&)): New.
    (_M_merge_unique<>, _M_merge_multi<>): Use the latter.
    * testsuite/23_containers/unordered_map/modifiers/merge.cc
    (test04, test05, test06): New test cases.

Tested under Linux x86_64. OK to commit?

François

diff --git a/libstdc++-v3/include/bits/hashtable.h b/libstdc++-v3/include/bits/hashtable.h
index 4c12dc895b2..f69acfe5213 100644
--- a/libstdc++-v3/include/bits/hashtable.h
+++ b/libstdc++-v3/include/bits/hashtable.h
@@ -1109,6 +1109,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	return { __n, this->_M_node_allocator() };
   }
 
+  // Check whether __n's cached hash code can be reused; if not, compute
+  // the hash code using _Hash, since __n's _M_hash_code, if present, was
+  // computed using _H2.
+  template<typename _H2>
+	__hash_code
+	_M_src_hash_code(const _H2&, const key_type& __k,
+			 const __node_value_type& __src_n) const
+	{
+	  if constexpr (std::is_same_v<_H2, _Hash>)
+	    if constexpr (std::is_empty_v<_Hash>)
+	      return this->_M_hash_code(__src_n);
+
+	  return this->_M_hash_code(__k);
+	}
+
 public:
   // Extract a node.
   node_type
@@ -1146,7 +1160,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  auto __pos = __i++;
 	  const key_type& __k = _ExtractKey{}(*__pos);
 	  __hash_code __code
-		= this->_M_hash_code(__src.hash_function(), *__pos._M_cur);
+		= _M_src_hash_code(__src.hash_function(), __k, *__pos._M_cur);
 	  size_type __bkt = _M_bucket_index(__code);
 	  if (_M_find_node(__bkt, __k, __code) == nullptr)
 		{
@@ -1174,8 +1188,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  for (auto __i = __src.cbegin(), __end = __src.cend(); __i != __end;)
 	{
 	  auto __pos = __i++;
+	  const key_type& __k = _ExtractKey{}(*__pos);
 	  __hash_code __code
-		= this->_M_hash_code(__src.hash_function(), *__pos._M_cur);
+		= _M_src_hash_code(__src.hash_function(), __k, *__pos._M_cur);
 	  auto __nh = __src.extract(__pos);
 	  __hint = _M_insert_multi_node(__hint, __code, __nh._M_ptr)._M_cur;
 	  __nh._M_ptr = nullptr;
diff --git a/libstdc++-v3/include/bits/hashtable_policy.h b/libstdc++-v3/include/bits/hashtable_policy.h
index 86b32fb15f2..5d162463dc3 100644
--- a/libstdc++-v3/include/bits/hashtable_policy.h
+++ b/libstdc++-v3/include/bits/hashtable_policy.h
@@ -1319,19 +1319,6 @@ namespace __detail
 	  return _M_hash()(__k);
 	}
 
-  __hash_code
-  _M_hash_code(const _Hash&,
-		   const _Hash_node_value<_Value, true>& __n) const
-  { return __n._M_hash_code; }
-
-  // Compute hash code using _Hash as __n _M_hash_code, if present, was
-  // computed using _H2.
-  template<typename _H2>
-	__hash_code
-	_M_hash_code(const _H2&,
-		const _Hash_node_value<_Value, __cache_hash_code>& __n) const
-	{ return _M_hash_code(_ExtractKey{}(__n._M_v())); }
-
   __hash_code
   _M_hash_code(const _Hash_node_value<_Value, false>& __n) const
   { return _M_hash_code(_ExtractKey{}(__n._M_v())); }
diff --git a/libstdc++-v3/testsuite/23_containers/unordered_map/modifiers/merge.cc b/libstdc++-v3/testsuite/23_containers/unordered_map/modifiers/merge.cc
index b140ce452aa..c051b58137a 100644
--- a/libstdc++-v3/testsuite/23_containers/unordered_map/modifiers/merge.cc
+++ b/libstdc++-v3/testsuite/23_containers/unordered_map/modifiers/merge.cc
@@ -17,15 +17,29 @@
 
 // { dg-do run { target c++17 } }
 
+#include 
+#include 
 #include 
 #include 
 #include 
 
using test_type = std::unordered_map<int, int>;
 
-struct hash {
-  auto operator()(int i) const noexcept { return ~std::hash<int>()(i); }
-};
+template<typename T>
+  struct xhash
+  {
+    auto operator()(const T& i) const noexcept
+    { return ~std::hash<T>()(i); }
+  };
+
+
+namespace std
+{
+  template<typename T>
+    struct __is_fast_hash<xhash<T>> : __is_fast_hash<hash<T>>
+    { };
+}
+
 struct equal : std::equal_to<> { };
 
 template
@@ -64,7 +78,7 @@ test02()
 {
   const test_type c0{ {1, 10}, {2, 20}, {3, 30} };
   test_type c1 = c0;
-  std::unordered_map<int, int, hash, equal> c2( c0.begin(), c0.end() );
+  std::unordered_map<int, int, xhash<int>, equal> c2( c0.begin(), c0.end() );
 
   c1.merge(c2);
   VERIFY( c1 == c0 );
@@ -89,7 +103,7 @@ test03()
 {
   const test_type c0{ {1, 10}, {2, 20}, {3, 30} };
   test_type c1 = c0;
-  std::unordered_multimap<int, int, hash, equal> c2( c0.begin(), c0.end() );
+  std::unordered_multimap<int, int, xhash<int>, equal> c2( c0.begin(), c0.end() );
   c1.merge(c2);
   VERIFY( c1 == c0 );
   VERIFY( equal_elements(c2, c0) );
@@ -125,10 +139,164 @@ test03()
   VERIFY( c2.empty() );
 }
 
+void
+test04()
+{
+  const 

[PATCH] aarch64: [PR110986] Emit csinv again for `a ? ~b : b`

2023-10-18 Thread Andrew Pinski
After r14-3110-g7fb65f10285, the canonical form for
`a ? ~b : b` changed to be `-(a) ^ b`, which means
for aarch64 we need to add a few new insn patterns
to catch this and convert it back to the form that
is canonical for the aarch64 backend.
A second pattern was needed to support the zero-extended
form too; the patch adds a testcase covering all 3 cases.
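For reference, the identity the canonical form relies on, shown as a small
C sketch (illustrative, not part of the patch):

  /* For a condition c normalized to 0 or 1, -c is 0 or ~0 (all ones),
     so xor with b yields b or ~b respectively.  */
  unsigned select (unsigned c, unsigned b)
  {
    return (-c) ^ b;   /* == c ? ~b : b */
  }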

Bootstrapped and tested on aarch64-linux-gnu with no regressions.

PR target/110986

gcc/ChangeLog:

* config/aarch64/aarch64.md (*cmov<mode>_insn_insv): New pattern.
(*cmov_uxtw_insn_insv): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/cond_op-1.c: New test.
---
 gcc/config/aarch64/aarch64.md| 46 
 gcc/testsuite/gcc.target/aarch64/cond_op-1.c | 20 +
 2 files changed, 66 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/cond_op-1.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 32c7adc8928..59cd0415937 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4413,6 +4413,52 @@ (define_insn "*csinv3_uxtw_insn3"
   [(set_attr "type" "csel")]
 )
 
+;; There are two canonical forms for `cmp ? ~a : a`.
+;; This is the second form and is here to help combine.
+;; Support `-(cmp) ^ a` into `cmp ? ~a : a`
+;; The second pattern is to support the zero extend'ed version.
+
+(define_insn_and_split "*cmov<mode>_insn_insv"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+(xor:GPI
+(neg:GPI
+ (match_operator:GPI 1 "aarch64_comparison_operator"
+  [(match_operand 2 "cc_register" "") (const_int 0)]))
+(match_operand:GPI 3 "general_operand" "r")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& true"
+  [(set (match_dup 0)
+   (if_then_else:GPI (match_dup 1)
+ (not:GPI (match_dup 3))
+ (match_dup 3)))]
+  {
+    operands[3] = force_reg (<MODE>mode, operands[3]);
+  }
+  [(set_attr "type" "csel")]
+)
+
+(define_insn_and_split "*cmov_uxtw_insn_insv"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+(zero_extend:DI
+(xor:SI
+ (neg:SI
+  (match_operator:SI 1 "aarch64_comparison_operator"
+   [(match_operand 2 "cc_register" "") (const_int 0)]))
+	 (match_operand:SI 3 "general_operand" "r"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& true"
+  [(set (match_dup 0)
+   (if_then_else:DI (match_dup 1)
+ (zero_extend:DI (not:SI (match_dup 3)))
+	 (zero_extend:DI (match_dup 3))))]
+  {
+operands[3] = force_reg (SImode, operands[3]);
+  }
+  [(set_attr "type" "csel")]
+)
+
 ;; If X can be loaded by a single CNT[BHWD] instruction,
 ;;
 ;;A = UMAX (B, X)
diff --git a/gcc/testsuite/gcc.target/aarch64/cond_op-1.c 
b/gcc/testsuite/gcc.target/aarch64/cond_op-1.c
new file mode 100644
index 000..e6c7821127e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cond_op-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* PR target/110986 */
+
+
+long long full(unsigned a, unsigned b)
+{
+  return a ? ~b : b;
+}
+unsigned fuu(unsigned a, unsigned b)
+{
+  return a ? ~b : b;
+}
+long long f(unsigned long long a, unsigned long long b)
+{
+  return a ? ~b : b;
+}
+
+/* { dg-final { scan-assembler-times "csinv\tw\[0-9\]*" 2 } } */
+/* { dg-final { scan-assembler-times "csinv\tx\[0-9\]*" 1 } } */
-- 
2.39.3



[PATCH 6/6] PowerPC: Add support for 1,024 bit DMR registers.

2023-10-18 Thread Michael Meissner
This patch is a preliminary patch to add the full 1,024 bit dense math register
(DMRs) for -mcpu=future.  The MMA 512-bit accumulators map onto the top of the
DMR register.

This patch only adds the new 1,024 bit register support.  It does not add
support for any instructions that need 1,024 bit registers instead of 512 bit
registers.

I used the new mode 'TDOmode' to be the opaque mode used for 1,024 bit
registers.  The 'wD' constraint added in previous patches is used for these
registers.  I added support to do load and store of DMRs via the VSX registers,
since there are no load/store dense math instructions.  I added the new keyword
'__dmr' to create 1,024 bit types that can be loaded into DMRs.  At present, I
don't have aliases for __dmr512 and __dmr1024 that we've discussed internally.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?
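As a rough usage sketch (hypothetical, not from the patch; assumes __dmr is
an opaque type like the existing MMA types and that -mcpu=future is in
effect), the only supported operations so far are data movement:

  /* Copy a 1,024-bit DMR value; loads and stores go through the VSX
     registers since there are no DMR load/store instructions.  */
  void copy_dmr (__dmr *src, __dmr *dst)
  {
    __dmr tmp = *src;
    *dst = tmp;
  }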

2023-10-18   Michael Meissner  

gcc/

* config/rs6000/mma.md (UNSPEC_DM_INSERT512_UPPER): New unspec.
(UNSPEC_DM_INSERT512_LOWER): Likewise.
(UNSPEC_DM_EXTRACT512): Likewise.
(UNSPEC_DMR_RELOAD_FROM_MEMORY): Likewise.
(UNSPEC_DMR_RELOAD_TO_MEMORY): Likewise.
(movtdo): New define_expand and define_insn_and_split to implement 1,024
bit DMR registers.
(movtdo_insert512_upper): New insn.
(movtdo_insert512_lower): Likewise.
(movtdo_extract512): Likewise.
(reload_dmr_from_memory): Likewise.
(reload_dmr_to_memory): Likewise.
* config/rs6000/rs6000-builtin.cc (rs6000_type_string): Add DMR
support.
(rs6000_init_builtins): Add support for __dmr keyword.
* config/rs6000/rs6000-call.cc (rs6000_return_in_memory): Add support
for TDOmode.
(rs6000_function_arg): Likewise.
* config/rs6000/rs6000-modes.def (TDOmode): New mode.
* config/rs6000/rs6000.cc (rs6000_hard_regno_nregs_internal): Add
support for TDOmode.
(rs6000_hard_regno_mode_ok_uncached): Likewise.
(rs6000_hard_regno_mode_ok): Likewise.
(rs6000_modes_tieable_p): Likewise.
(rs6000_debug_reg_global): Likewise.
(rs6000_setup_reg_addr_masks): Likewise.
(rs6000_init_hard_regno_mode_ok): Add support for TDOmode.  Setup reload
hooks for DMR mode.
(reg_offset_addressing_ok_p): Add support for TDOmode.
(rs6000_emit_move): Likewise.
(rs6000_secondary_reload_simple_move): Likewise.
(rs6000_secondary_reload_class): Likewise.
(rs6000_mangle_type): Add mangling for __dmr type.
(rs6000_dmr_register_move_cost): Add support for TDOmode.
(rs6000_split_multireg_move): Likewise.
(rs6000_invalid_conversion): Likewise.
* config/rs6000/rs6000.h (VECTOR_ALIGNMENT_P): Add TDOmode.
(enum rs6000_builtin_type_index): Add DMR type nodes.
(dmr_type_node): Likewise.
(ptr_dmr_type_node): Likewise.

gcc/testsuite/

* gcc.target/powerpc/dm-1024bit.c: New test.
---
 gcc/config/rs6000/mma.md  | 152 ++
 gcc/config/rs6000/rs6000-builtin.cc   |  13 ++
 gcc/config/rs6000/rs6000-call.cc  |  13 +-
 gcc/config/rs6000/rs6000-modes.def|   4 +
 gcc/config/rs6000/rs6000.cc   | 135 
 gcc/config/rs6000/rs6000.h|   7 +-
 gcc/testsuite/gcc.target/powerpc/dm-1024bit.c |  63 
 7 files changed, 351 insertions(+), 36 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dm-1024bit.c

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index cae407bc37c..0a89db8af99 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -93,6 +93,11 @@ (define_c_enum "unspec"
UNSPEC_MMA_XXMTACC
UNSPEC_MMA_VECTOR_PAIR_MEMORY
UNSPEC_DM_ASSEMBLE_ACC
+   UNSPEC_DM_INSERT512_UPPER
+   UNSPEC_DM_INSERT512_LOWER
+   UNSPEC_DM_EXTRACT512
+   UNSPEC_DMR_RELOAD_FROM_MEMORY
+   UNSPEC_DMR_RELOAD_TO_MEMORY
   ])
 
 (define_c_enum "unspecv"
@@ -916,3 +921,150 @@ (define_insn "mma_"
   [(set_attr "type" "mma")
(set_attr "prefixed" "yes")
(set_attr "isa" "dm,not_dm,not_dm")])
+
+
+;; TDOmode (i.e. __dmr).
+(define_expand "movtdo"
+  [(set (match_operand:TDO 0 "nonimmediate_operand")
+   (match_operand:TDO 1 "input_operand"))]
+  "TARGET_DENSE_MATH"
+{
+  rs6000_emit_move (operands[0], operands[1], TDOmode);
+  DONE;
+})
+
+(define_insn_and_split "*movtdo"
+  [(set (match_operand:TDO 0 "nonimmediate_operand" "=wa,m,wa,wD,wD,wa")
+   (match_operand:TDO 1 "input_operand" "m,wa,wa,wa,wD,wD"))]
+  "TARGET_DENSE_MATH
+   && (gpc_reg_operand (operands[0], TDOmode)
+   || gpc_reg_operand (operands[1], TDOmode))"
+  "@
+   #
+   #
+   #
+   #
+   dmmr %0,%1
+   #"
+  "&& reload_completed
+   && (!dmr_operand (operands[0], TDOmode)
+       || !dmr_operand (operands[1], TDOmode))"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = 

[PATCH 5/6] PowerPC: Switch to dense math names for all MMA operations.

2023-10-18 Thread Michael Meissner
This patch changes the assembler instruction names for MMA instructions from
the original name used in power10 to the new name when used with the dense math
system.  I.e. xvf64gerpp becomes dmxvf64gerpp.  The assembler will emit the
same bits for either spelling.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2023-10-18   Michael Meissner  

gcc/

* config/rs6000/mma.md (vvi4i4i8_dm): New int attribute.
(avvi4i4i8_dm): Likewise.
(vvi4i4i2_dm): Likewise.
(avvi4i4i2_dm): Likewise.
(vvi4i4_dm): Likewise.
(avvi4i4_dm): Likewise.
(pvi4i2_dm): Likewise.
(apvi4i2_dm): Likewise.
(vvi4i4i4_dm): Likewise.
(avvi4i4i4_dm): Likewise.
(mma_): Add support for running on DMF systems, generating the dense
math instruction and using the dense math accumulators.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.

gcc/testsuite/

* gcc.target/powerpc/dm-double-test.c: New test.
* lib/target-supports.exp (check_effective_target_ppc_dmr_ok): New
target test.
---
 gcc/config/rs6000/mma.md  |  98 +++--
 .../gcc.target/powerpc/dm-double-test.c   | 194 ++
 gcc/testsuite/lib/target-supports.exp |  19 ++
 3 files changed, 299 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/dm-double-test.c

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index e5589d8eccc..cae407bc37c 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -228,13 +228,22 @@ (define_int_attr apv  [(UNSPEC_MMA_XVF64GERPP 
"xvf64gerpp")
 
 (define_int_attr vvi4i4i8  [(UNSPEC_MMA_PMXVI4GER8 "pmxvi4ger8")])
 
+(define_int_attr vvi4i4i8_dm   [(UNSPEC_MMA_PMXVI4GER8 
"pmdmxvi4ger8")])
+
 (define_int_attr avvi4i4i8 [(UNSPEC_MMA_PMXVI4GER8PP   
"pmxvi4ger8pp")])
 
+(define_int_attr avvi4i4i8_dm  [(UNSPEC_MMA_PMXVI4GER8PP   
"pmdmxvi4ger8pp")])
+
 (define_int_attr vvi4i4i2  [(UNSPEC_MMA_PMXVI16GER2"pmxvi16ger2")
 (UNSPEC_MMA_PMXVI16GER2S   "pmxvi16ger2s")
 (UNSPEC_MMA_PMXVF16GER2"pmxvf16ger2")
 (UNSPEC_MMA_PMXVBF16GER2   
"pmxvbf16ger2")])
 
+(define_int_attr vvi4i4i2_dm   [(UNSPEC_MMA_PMXVI16GER2"pmdmxvi16ger2")
+(UNSPEC_MMA_PMXVI16GER2S   
"pmdmxvi16ger2s")
+(UNSPEC_MMA_PMXVF16GER2"pmdmxvf16ger2")
+(UNSPEC_MMA_PMXVBF16GER2   
"pmdmxvbf16ger2")])
+
 (define_int_attr avvi4i4i2 [(UNSPEC_MMA_PMXVI16GER2PP  "pmxvi16ger2pp")
 (UNSPEC_MMA_PMXVI16GER2SPP 
"pmxvi16ger2spp")
 (UNSPEC_MMA_PMXVF16GER2PP  "pmxvf16ger2pp")
@@ -246,25 +255,54 @@ (define_int_attr avvi4i4i2
[(UNSPEC_MMA_PMXVI16GER2PP  "pmxvi16ger2pp")
 (UNSPEC_MMA_PMXVBF16GER2NP 
"pmxvbf16ger2np")
 (UNSPEC_MMA_PMXVBF16GER2NN 
"pmxvbf16ger2nn")])
 
+(define_int_attr avvi4i4i2_dm  [(UNSPEC_MMA_PMXVI16GER2PP  
"pmdmxvi16ger2pp")
+(UNSPEC_MMA_PMXVI16GER2SPP 
"pmdmxvi16ger2spp")
+(UNSPEC_MMA_PMXVF16GER2PP  
"pmdmxvf16ger2pp")
+(UNSPEC_MMA_PMXVF16GER2PN  
"pmdmxvf16ger2pn")
+(UNSPEC_MMA_PMXVF16GER2NP  
"pmdmxvf16ger2np")
+(UNSPEC_MMA_PMXVF16GER2NN  
"pmdmxvf16ger2nn")
+(UNSPEC_MMA_PMXVBF16GER2PP 
"pmdmxvbf16ger2pp")
+(UNSPEC_MMA_PMXVBF16GER2PN 
"pmdmxvbf16ger2pn")
+(UNSPEC_MMA_PMXVBF16GER2NP 
"pmdmxvbf16ger2np")
+(UNSPEC_MMA_PMXVBF16GER2NN 
"pmdmxvbf16ger2nn")])
+
 (define_int_attr vvi4i4[(UNSPEC_MMA_PMXVF32GER 
"pmxvf32ger")])
 
+(define_int_attr vvi4i4_dm [(UNSPEC_MMA_PMXVF32GER 
"pmdmxvf32ger")])
+
 (define_int_attr avvi4i4   [(UNSPEC_MMA_PMXVF32GERPP   "pmxvf32gerpp")
 (UNSPEC_MMA_PMXVF32GERPN   "pmxvf32gerpn")
 (UNSPEC_MMA_PMXVF32GERNP   "pmxvf32gernp")
 (UNSPEC_MMA_PMXVF32GERNN   
"pmxvf32gernn")])
 
+(define_int_attr avvi4i4_dm[(UNSPEC_MMA_PMXVF32GERPP   
"pmdmxvf32gerpp")
+(UNSPEC_MMA_PMXVF32GERPN   
"pmdmxvf32gerpn")
+ 

[PATCH 4/6] PowerPC: Make MMA insns support DMR registers.

2023-10-18 Thread Michael Meissner
This patch changes the MMA instructions to use either FPR registers
(-mcpu=power10) or DMRs (-mcpu=future).  In this patch, the existing MMA
instruction names are used.

A macro (__PPC_DMR__) is defined if the MMA instructions use the DMRs.
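A minimal sketch (illustrative only) of dispatching on that macro:

  #ifdef __PPC_DMR__
  /* MMA built-ins target the separate DMR accumulators here.  */
  #else
  /* Accumulators overlap with FPRs 0..31 (power10 behavior).  */
  #endif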

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2023-10-18   Michael Meissner  

gcc/

* config/rs6000/mma.md (mma_<acc>): New define_expand to handle
mma_<acc> for dense math and non dense math.
(mma_<acc> insn): Restrict to non dense math.
(mma_xxsetaccz): Convert to define_expand to handle non dense math and
dense math.
(mma_xxsetaccz_vsx): Rename from mma_xxsetaccz and restrict usage to non
dense math.
(mma_xxsetaccz_dm): Dense math version of mma_xxsetaccz.
(mma_): Add support for dense math.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
(mma_): Likewise.
* config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Define
__PPC_DMR__ if we have dense math instructions.
* config/rs6000/rs6000.cc (print_operand): Make %A handle only DMRs if
dense math and only FPRs if not dense math.
(rs6000_split_multireg_move): Do not generate the xxmtacc instruction to
prime the DMR registers or the xxmfacc instruction to de-prime
instructions if we have dense math register support.
---
 gcc/config/rs6000/mma.md  | 247 +-
 gcc/config/rs6000/rs6000-c.cc |   3 +
 gcc/config/rs6000/rs6000.cc   |  35 ++---
 3 files changed, 176 insertions(+), 109 deletions(-)

diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index d2c5b73fa8f..e5589d8eccc 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -596,190 +596,249 @@ (define_insn "*mma_disassemble_acc_dm"
   "dmxxextfdmr256 %0,%1,2"
   [(set_attr "type" "mma")])
 
-(define_insn "mma_<acc>"
+;; MMA instructions that do not use their accumulators as an input still must
+;; not allow their vector operands to overlap the registers used by the
+;; accumulator.  We enforce this by marking the output as early clobber.  If we
+;; have dense math, we don't need the whole prime/de-prime action, so just make
+;; these instructions be NOPs.
+
+(define_expand "mma_<acc>"
+  [(set (match_operand:XO 0 "register_operand")
+   (unspec:XO [(match_operand:XO 1 "register_operand")]
+  MMA_ACC))]
+  "TARGET_MMA"
+{
+  if (TARGET_DENSE_MATH)
+{
+  if (!rtx_equal_p (operands[0], operands[1]))
+   emit_move_insn (operands[0], operands[1]);
+  DONE;
+}
+
+  /* Generate the prime/de-prime code.  */
+})
+
+(define_insn "*mma_<acc>"
   [(set (match_operand:XO 0 "fpr_reg_operand" "=&d")
	(unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0")]
		   MMA_ACC))]
-  "TARGET_MMA"
+  "TARGET_MMA && !TARGET_DENSE_MATH"
   "<acc> %A0"
   [(set_attr "type" "mma")])
 
 ;; We can't have integer constants in XOmode so we wrap this in an
-;; UNSPEC_VOLATILE.
+;; UNSPEC_VOLATILE for the non-dense math case.  For dense math, we don't need
+;; to disable optimization and we can do a normal UNSPEC.
 
-(define_insn "mma_xxsetaccz"
-  [(set (match_operand:XO 0 "fpr_reg_operand" "=d")
+(define_expand "mma_xxsetaccz"
+  [(set (match_operand:XO 0 "register_operand")
(unspec_volatile:XO [(const_int 0)]
UNSPECV_MMA_XXSETACCZ))]
   "TARGET_MMA"
+{
+  if (TARGET_DENSE_MATH)
+{
+  emit_insn (gen_mma_xxsetaccz_dm (operands[0]));
+  DONE;
+}
+})
+
+(define_insn "*mma_xxsetaccz_vsx"
+  [(set (match_operand:XO 0 "fpr_reg_operand" "=d")
+   (unspec_volatile:XO [(const_int 0)]
+   UNSPECV_MMA_XXSETACCZ))]
+  "TARGET_MMA && !TARGET_DENSE_MATH"
   "xxsetaccz %A0"
   [(set_attr "type" "mma")])
 
+
+(define_insn "mma_xxsetaccz_dm"
+  [(set (match_operand:XO 0 "dmr_operand" "=wD")
+   (unspec:XO [(const_int 0)]
+  UNSPECV_MMA_XXSETACCZ))]
+  "TARGET_DENSE_MATH"
+  "dmsetdmrz %0"
+  [(set_attr "type" "mma")])
+
 (define_insn "mma_<vv>"
-  [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d")
-	(unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,?wa")
-		    (match_operand:V16QI 2 "vsx_register_operand" "v,?wa")]
+  [(set (match_operand:XO 0 "accumulator_operand" "=wD,&d,&d")
+	(unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa,v,?wa")
+		    (match_operand:V16QI 2 "vsx_register_operand" "wa,v,?wa")]
		   MMA_VV))]
   "TARGET_MMA"
   "<vv> %A0,%x1,%x2"
-  [(set_attr "type" "mma")])
+  [(set_attr "type" "mma")
+   (set_attr "isa" "dm,not_dm,not_dm")])
 
 (define_insn "mma_"
-  [(set (match_operand:XO 0 

[PATCH 3/6] PowerPC: Add support for accumulators in DMR registers.

2023-10-18 Thread Michael Meissner
The MMA subsystem added the notion of accumulator registers as an optional
feature of ISA 3.1 (power10).  In ISA 3.1, these accumulators overlapped with
the traditional floating point registers 0..31, but logically the accumulator
registers were separate from the FPR registers.  In ISA 3.1, it was anticipated
that in future systems, the accumulator registers may not overlap with the FPR
registers.  This patch adds the support for dense math registers as separate
registers.

This particular patch does not change the MMA support to use the accumulators
within the dense math registers.  This patch just adds the basic support for
having separate DMRs.  The next patch will switch the MMA support to use the
accumulators if -mcpu=future is used.

For testing purposes, I added an undocumented option '-mdense-math' to enable
or disable the dense math support.

This patch adds a new constraint (wD).  If MMA is selected but dense math is
not selected (i.e. -mcpu=power10), the wD constraint will allow access to
accumulators that overlap with the VSX vector registers 0..31.  If both MMA and
dense math are selected (i.e. -mcpu=future), the wD constraint will only allow
dense math registers.

This patch modifies the existing %A output modifier.  If MMA is selected but
dense math is not selected, then %A output modifier converts the VSX register
number to the accumulator number, by dividing it by 4.  If both MMA and dense
math are selected, then %A will map the separate DMR registers into 0..7.

The intention is that user code using extended asm can be modified to run on
both MMA without dense math and MMA with dense math:

1)  If possible, don't use extended asm, but instead use the MMA built-in
functions;

2)  If you do need to write extended asm, change the d constraints
targeting accumulators to use wD instead (see the sketch after this
list);

3)  Only use the built-in zero, assemble and disassemble functions to
create and move data between vector quad types and dense math
accumulators.  I.e. do not use the xxmfacc, xxmtacc, and xxsetaccz
instructions directly in the extended asm code.  The reason is these
instructions assume there is a 1-to-1 correspondence between 4 adjacent
FPR registers and an accumulator that overlaps with those registers.
With accumulators now being separate registers, there no longer is a
1-to-1 correspondence.
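As a rough illustration (not from the patch; operands and instruction choice
are assumptions, based on the ISA 3.1 MMA types from altivec.h), extended asm
written this way should work with either register layout:

  #include <altivec.h>

  /* The wD constraint picks an accumulator (FPR-overlapped on power10,
     a DMR with -mcpu=future), and %A prints its accumulator number.  */
  void f32ger (__vector_quad *acc, vector unsigned char a,
	       vector unsigned char b)
  {
    __vector_quad tmp;
    __asm__ ("xvf32ger %A0,%x1,%x2" : "=wD" (tmp) : "wa" (a), "wa" (b));
    *acc = tmp;
  }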

It is possible that the mangling for DMRs and the GDB register numbers may
change in the future.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2023-10-18   Michael Meissner  

gcc/

* config/rs6000/constraints.md (wD constraint): New constraint.
* config/rs6000/mma.md (UNSPEC_DM_ASSEMBLE_ACC): New unspec.
(movxo): Convert into define_expand.
(movxo_vsx): Version of movxo where accumulators overlap with VSX vector
registers 0..31.
(movxo_dm): Verson of movxo that supports separate dense math
accumulators.
(mma_assemble_acc): Add dense math support to define_expand.
(mma_assemble_acc_vsx): Rename from mma_assemble_acc, and restrict it to
non dense math systems.
(mma_assemble_acc_dm): Dense math version of mma_assemble_acc.
(mma_disassemble_acc): Add dense math support to define_expand.
(mma_disassemble_acc_vsx): Rename from mma_disassemble_acc, and restrict
it to non dense math systems.
(mma_disassemble_acc_dm): Dense math version of mma_disassemble_acc.
* config/rs6000/predicates.md (dmr_operand): New predicate.
(accumulator_operand): Likewise.
* config/rs6000/rs6000-cpus.def (ISA_FUTURE_MASKS): Add -mdense-math.
(POWERPC_MASKS): Likewise.
* config/rs6000/rs6000.cc (enum rs6000_reg_type): Add DMR_REG_TYPE.
(enum rs6000_reload_reg_type): Add RELOAD_REG_DMR.
(LAST_RELOAD_REG_CLASS): Add support for DMR registers and the wD
constraint.
(reload_reg_map): Likewise.
(rs6000_reg_names): Likewise.
(alt_reg_names): Likewise.
(rs6000_hard_regno_nregs_internal): Likewise.
(rs6000_hard_regno_mode_ok_uncached): Likewise.
(rs6000_debug_reg_global): Likewise.
(rs6000_setup_reg_addr_masks): Likewise.
(rs6000_init_hard_regno_mode_ok): Likewise.
(rs6000_option_override_internal): Add checking for -mdense-math.
(rs6000_secondary_reload_memory): Add support for DMR registers.
(rs6000_secondary_reload_simple_move): Likewise.
(rs6000_preferred_reload_class): Likewise.
(rs6000_secondary_reload_class): Likewise.
(print_operand): Make %A handle both FPRs and DMRs.
(rs6000_dmr_register_move_cost): New helper function.
(rs6000_register_move_cost): Add support for DMR registers.
(rs6000_memory_move_cost): Likewise.
(rs6000_compute_pressure_classes): Likewise.
(rs6000_debugger_regno): 

[PATCH 2/6] PowerPC: Make -mcpu=future enable -mblock-ops-vector-pair.

2023-10-18 Thread Michael Meissner
This patch re-enables generating load and store vector pair instructions when
doing certain memory copy operations when -mcpu=future is used.

During power10 development, it was determined that using store vector pair
instructions were problematical in a few cases, so we disabled generating load
and store vector pair instructions for memory options by default.  This patch
re-enables generating these instructions if -mcpu=future is used.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2023-10-18   Michael Meissner  

gcc/

* config/rs6000/rs6000-cpus.def (ISA_FUTURE_MASKS): Add
-mblock-ops-vector-pair.
(POWERPC_MASKS): Likewise.
---
 gcc/config/rs6000/rs6000-cpus.def | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-cpus.def 
b/gcc/config/rs6000/rs6000-cpus.def
index a6d9d7bf9a8..849af6b3ac8 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -90,6 +90,7 @@
 
 /* Flags for a potential future processor that may or may not be delivered.  */
 #define ISA_FUTURE_MASKS   (ISA_3_1_MASKS_SERVER   \
+| OPTION_MASK_BLOCK_OPS_VECTOR_PAIR\
 | OPTION_MASK_FUTURE)
 
 /* Flags that need to be turned off if -mno-power9-vector.  */
@@ -127,6 +128,7 @@
 
 /* Mask of all options to set the default isa flags based on -mcpu=.  */
 #define POWERPC_MASKS  (OPTION_MASK_ALTIVEC\
+| OPTION_MASK_BLOCK_OPS_VECTOR_PAIR\
 | OPTION_MASK_CMPB \
 | OPTION_MASK_CRYPTO   \
 | OPTION_MASK_DFP  \
-- 
2.41.0


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


Re: [PATCH 1/6] PowerPC: Add -mcpu=future option

2023-10-18 Thread Michael Meissner
This patch implements support for a potential future PowerPC cpu.  Features
added with -mcpu=future may or may not be added to new PowerPC processors.

This patch adds support for the -mcpu=future option.  If you use -mcpu=future,
the macro _ARCH_PWR_FUTURE is defined, and the assembler .machine directive
"future" is used.  Future patches in this series will add support for new
instructions that may be present in future PowerPC processors.
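A minimal sketch (illustrative only) of guarding code on the new macro:

  #ifdef _ARCH_PWR_FUTURE
  /* Code paths that may use future-only instructions.  */
  #else
  /* power10 fallback path.  */
  #endif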

This particular patch does not add any new features.  It exists as groundwork
for future patches supporting a possible future PowerPC processor.

This patch does not implement any differences in tuning when -mcpu=future is
used compared to -mcpu=power10.  If -mcpu=future is used, GCC will use power10
tuning.  If you explicitly use -mtune=future, you will get a warning that
-mtune=future is not supported, and default tuning will be set for power10.

The patches have been tested on both little and big endian systems.  Can I check
it into the master branch?

2023-10-18   Michael Meissner  

gcc/

* config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Define
_ARCH_PWR_FUTURE if -mcpu=future.
* config/rs6000/rs6000-cpus.def (ISA_FUTURE_MASKS): New macro.
(POWERPC_MASKS): Add -mcpu=future support.
* config/rs6000/rs6000-opts.h (enum processor_type): Add
PROCESSOR_FUTURE.
* config/rs6000/rs6000-tables.opt: Regenerate.
* config/rs6000/rs6000.cc (rs6000_cpu_index_lookup): New helper
function.
(rs6000_option_override_internal): Make -mcpu=future set
-mtune=power10.  If the user explicitly uses -mtune=future, give a
warning and reset the tuning to power10.
(rs6000_option_override_internal): Use power10 costs for future
machine.
(rs6000_machine_from_flags): Add support for -mcpu=future.
(rs6000_opt_masks): Likewise.
* config/rs6000/rs6000.h (ASM_CPU_SUPPORT): Likewise.
* config/rs6000/rs6000.md (cpu attribute): Likewise.
* config/rs6000/rs6000.opt (-mfuture): New undocumented debug switch.
* doc/invoke.texi (IBM RS/6000 and PowerPC Options): Document 
-mcpu=future.
---
 gcc/config/rs6000/rs6000-c.cc   |  2 +
 gcc/config/rs6000/rs6000-cpus.def   |  6 +++
 gcc/config/rs6000/rs6000-opts.h |  4 +-
 gcc/config/rs6000/rs6000-tables.opt |  3 ++
 gcc/config/rs6000/rs6000.cc | 58 -
 gcc/config/rs6000/rs6000.h  |  1 +
 gcc/config/rs6000/rs6000.md |  2 +-
 gcc/config/rs6000/rs6000.opt|  4 ++
 gcc/doc/invoke.texi |  2 +-
 9 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 65be0ac43e2..e276c20cccd 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -447,6 +447,8 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT 
flags)
 rs6000_define_or_undefine_macro (define_p, "_ARCH_PWR9");
   if ((flags & OPTION_MASK_POWER10) != 0)
 rs6000_define_or_undefine_macro (define_p, "_ARCH_PWR10");
+  if ((flags & OPTION_MASK_FUTURE) != 0)
+rs6000_define_or_undefine_macro (define_p, "_ARCH_PWR_FUTURE");
   if ((flags & OPTION_MASK_SOFT_FLOAT) != 0)
 rs6000_define_or_undefine_macro (define_p, "_SOFT_FLOAT");
   if ((flags & OPTION_MASK_RECIP_PRECISION) != 0)
diff --git a/gcc/config/rs6000/rs6000-cpus.def 
b/gcc/config/rs6000/rs6000-cpus.def
index 8c530a22da8..a6d9d7bf9a8 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -88,6 +88,10 @@
 | OPTION_MASK_POWER10  \
 | OTHER_POWER10_MASKS)
 
+/* Flags for a potential future processor that may or may not be delivered.  */
+#define ISA_FUTURE_MASKS   (ISA_3_1_MASKS_SERVER   \
+| OPTION_MASK_FUTURE)
+
 /* Flags that need to be turned off if -mno-power9-vector.  */
 #define OTHER_P9_VECTOR_MASKS  (OPTION_MASK_FLOAT128_HW\
 | OPTION_MASK_P9_MINMAX)
@@ -134,6 +138,7 @@
 | OPTION_MASK_FPRND\
 | OPTION_MASK_POWER10  \
 | OPTION_MASK_P10_FUSION   \
+| OPTION_MASK_FUTURE   \
 | OPTION_MASK_HTM  \
 | OPTION_MASK_ISEL \
 | OPTION_MASK_LOAD_VECTOR_PAIR \
@@ -267,3 +272,4 @@ RS6000_CPU ("powerpc64", PROCESSOR_POWERPC64, 
OPTION_MASK_PPC_GFXOPT
 RS6000_CPU ("powerpc64le", PROCESSOR_POWER8, MASK_POWERPC64
| ISA_2_7_MASKS_SERVER | OPTION_MASK_HTM)
 RS6000_CPU ("rs64", PROCESSOR_RS64A, OPTION_MASK_PPC_GFXOPT | MASK_POWERPC64)

[PATCH 0/6] PowerPC Future patches

2023-10-18 Thread Michael Meissner
This patch is very preliminary support for a potential new feature to the
PowerPC that extends the current power10 MMA architecture.  This feature may or
may not be present in any specific future PowerPC processor.

In the current MMA subsystem for Power10, there are 8 512-bit accumulator
registers.  These accumulators are each tied to sets of 4 FPR registers.  When
you issue a prime instruction, it makes sure the accumulator is a copy of the 4
FPR registers the accumulator is tied to.  When you issue a deprime
instruction, it makes sure that the accumulator data content is logically
copied to the matching FPR register.

In the potential dense math system, the accumulators are moved to separate
registers called dense math registers (DM registers or DMR).  The DMRs are then
extended to 1,024 bits and new instructions will be added to deal with all
1,024 bits of the DMRs.

If you take existing MMA code, it will work as long as you don't do anything
with accumulators, and you follow the rules in the ISA 3.1 documentation for
using the MMA subsystem.

These patches add support for the 512-bit accumulators within the dense math
system, and for allocation of the 1,024-bit DMRs.  At this time, no additional
built-in functions will be done to support any dense math features other than
doing data movement between the DMRs and the VSX registers.  Before we can look
at adding any new dense math support other than data movement, we need the GCC
compiler to be able to allocate and use these DMRs.

There are 6 patches in this patch set:

1) The first patch just adds -mcpu=future as an option to add new support.
This is similar to the -mcpu=future that we did before power10 was announced.

2) The second patch enables GCC to use the load and store vector pair
instructions to optimize memory copy operations in the compiler.  For power10,
we needed to just stay with normal vector load/stores for memory copy
operations.

3) The third patch enables 512-bit accumulators that are located within in DMRs
instead of the FPRs.  This patch enables the register allocation, but it does
not move the existing MMA to use these registers.

4) The fourth patch switches the MMA subsystem to use 512-bit accumulators
within DMRs if you use -mcpu=future.

5) The fifth patch switches the names of the MMA instructions to use the dense
math equivalent name if -mcpu=future.

6) The sixth patch enables using the full 1,024-bit DMRs.  Right now, all you
can do with DMRs is move a VSX register to a DMR register, and to move a DMR
register to a VSX register.

In terms of changes, these patch now use the wD constraint for accumulators.
If you compile with -mcpu=power10, the wD constraint will match the equivalent
FPR register that overlaps with the accumulator.  If you compile with
-mcpu=future, the wD constraint will match the DMR register and not the FPR
register.

These patches also modifies the print_operand %A output modifier to print out
DMR register numbers if -mcpu=future, and continue to print out the FPR
register number divided by 4 for -mcpu=power10.

In general, if you only use the built-in functions, things work between the two
systems.  If you use extended asm, you will likely need to modify the code.
Going forward, hopefully if you modify your code to use the wD constraint and
%A output modifier, you can write code that switches more easily between the
two systems.

Again, these are preliminary patches for a potential future machine.  Things
will likely change in terms of implementation and usage over time.

Originally these patches were submitted in November 2022:
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605581.html

-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


[COMMITTED] Fix expansion of `(a & 2) != 1`

2023-10-18 Thread Andrew Pinski
I had a thinko in r14-1600-ge60593f3881c72a96a3fa4844d73e8a2cd14f670
where we would remove the `& CST` part if we ended up not calling
expand_single_bit_test.
This fixes the problem by introducing a new variable that will be used
for calling expand_single_bit_test.
As far as I know this can only show up when disabling optimization
passes, as this form would have been optimized away earlier: `a & 2`
can only be 0 or 2, so `(a & 2) != 1` folds to a constant.

Committed as obvious after a bootstrap/test on x86_64-linux-gnu.

PR middle-end/111863

gcc/ChangeLog:

* expr.cc (do_store_flag): Don't overwrite arg0
when stripping off `& POW2`.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/pr111863-1.c: New test.
---
 gcc/expr.cc  |  9 +
 gcc/testsuite/gcc.c-torture/execute/pr111863-1.c | 16 
 2 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr111863-1.c

diff --git a/gcc/expr.cc b/gcc/expr.cc
index 8aed3fc6cbe..763bd82c59f 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -13206,14 +13206,15 @@ do_store_flag (sepops ops, rtx target, machine_mode 
mode)
  || integer_pow2p (arg1))
   && (TYPE_PRECISION (ops->type) != 1 || TYPE_UNSIGNED (ops->type)))
 {
-  wide_int nz = tree_nonzero_bits (arg0);
-  gimple *srcstmt = get_def_for_expr (arg0, BIT_AND_EXPR);
+  tree narg0 = arg0;
+  wide_int nz = tree_nonzero_bits (narg0);
+  gimple *srcstmt = get_def_for_expr (narg0, BIT_AND_EXPR);
   /* If the defining statement was (x & POW2), then use that instead of
 the non-zero bits.  */
   if (srcstmt && integer_pow2p (gimple_assign_rhs2 (srcstmt)))
{
  nz = wi::to_wide (gimple_assign_rhs2 (srcstmt));
- arg0 = gimple_assign_rhs1 (srcstmt);
+ narg0 = gimple_assign_rhs1 (srcstmt);
}
 
   if (wi::popcount (nz) == 1
@@ -13227,7 +13228,7 @@ do_store_flag (sepops ops, rtx target, machine_mode 
mode)
 
  type = lang_hooks.types.type_for_mode (mode, unsignedp);
  return expand_single_bit_test (loc, tcode,
-arg0,
+narg0,
 bitnum, type, target, mode);
}
 }
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr111863-1.c 
b/gcc/testsuite/gcc.c-torture/execute/pr111863-1.c
new file mode 100644
index 000..4e27fe631b2
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr111863-1.c
@@ -0,0 +1,16 @@
+/* { dg-options " -fno-tree-ccp -fno-tree-dominator-opts -fno-tree-vrp" } */
+
+__attribute__((noipa))
+int f(int a)
+{
+a &= 2;
+return a != 1;
+}
+int main(void)
+{
+int t = f(1);
+if (!t)
+__builtin_abort();
+__builtin_printf("%d\n",t);
+return 0;
+}
-- 
2.39.3



[PATCH] c++: Make -Wunknown-pragmas controllable by #pragma GCC diagnostic [PR89038]

2023-10-18 Thread Lewis Hyatt
Hello-

The PR points out that my fix for PR53431 was incomplete and did not handle
-Wunknown-pragmas. This is a one-line fix to correct that; is it OK for
trunk and for GCC 13 backport please? Bootstrapped + regtested all languages
on x86-64 Linux. Thanks!

-Lewis

-- >8 --

As noted on the PR, commit r13-1544, the fix for PR53431, did not handle
the specific case of -Wunknown-pragmas, because that warning is issued
during preprocessing, but not by libcpp directly (it comes from the
cb_def_pragma callback).  Address that by handling this pragma in
addition to libcpp pragmas during the early pragma handler.

gcc/c-family/ChangeLog:

PR c++/89038
* c-pragma.cc (handle_pragma_diagnostic_impl):  Handle
-Wunknown-pragmas during early processing.

gcc/testsuite/ChangeLog:

PR c++/89038
* c-c++-common/cpp/Wunknown-pragmas-1.c: New test.
---
 gcc/c-family/c-pragma.cc|  3 ++-
 gcc/testsuite/c-c++-common/cpp/Wunknown-pragmas-1.c | 13 +
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/c-c++-common/cpp/Wunknown-pragmas-1.c

diff --git a/gcc/c-family/c-pragma.cc b/gcc/c-family/c-pragma.cc
index 293311dd4ce..98dfb0f108b 100644
--- a/gcc/c-family/c-pragma.cc
+++ b/gcc/c-family/c-pragma.cc
@@ -963,7 +963,8 @@ handle_pragma_diagnostic_impl ()
   /* option_string + 1 to skip the initial '-' */
   unsigned int option_index = find_opt (data.option_str + 1, lang_mask);
 
-  if (early && !c_option_is_from_cpp_diagnostics (option_index))
+  if (early && !(c_option_is_from_cpp_diagnostics (option_index)
+|| option_index == OPT_Wunknown_pragmas))
 return;
 
   if (option_index == OPT_SPECIAL_unknown)
diff --git a/gcc/testsuite/c-c++-common/cpp/Wunknown-pragmas-1.c 
b/gcc/testsuite/c-c++-common/cpp/Wunknown-pragmas-1.c
new file mode 100644
index 000..fb58739e2bc
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/cpp/Wunknown-pragmas-1.c
@@ -0,0 +1,13 @@
+/* PR c++/89038 */
+/* { dg-additional-options "-Wunknown-pragmas" } */
+
+#pragma oops /* { dg-warning "-:-Wunknown-pragmas" } */
+#pragma GGC diagnostic push /* { dg-warning "-:-Wunknown-pragmas" } */
+#pragma GCC diagnostics push /* { dg-warning "-:-Wunknown-pragmas" } */
+
+/* Test we can disable the warnings.  */
+#pragma GCC diagnostic ignored "-Wunknown-pragmas"
+
+#pragma oops /* { dg-bogus "-:-Wunknown-pragmas" } */
+#pragma GGC diagnostic push /* { dg-bogus "-:-Wunknown-pragmas" } */
+#pragma GCC diagnostics push /* { dg-bogus "-:-Wunknown-pragmas" } */


Re: [PATCH V2 7/7] aarch64: Add system register duplication check selftest

2023-10-18 Thread Richard Sandiford
Victor Do Nascimento  writes:
> Add a build-time test to check whether system register data, as
> imported from `aarch64-sys-reg.def' has any duplicate entries.
>
> Duplicate entries are defined as any two SYSREG entries in the .def
> file which share the same encoding values (as specified by its `CPENC'
> field) and where the relationship amongst the two does not fit into
> one of the following categories:
>
>   * Simple aliasing: In some cases, it is observed that one
>   register name serves as an alias to another.  One example of
>   this is where TRCEXTINSELR aliases TRCEXTINSELR0.
>   * Expressing intent: It is possible that when a given register
>   serves two distinct functions depending on how it is used, it
>   is given two distinct names whose use should match the context
>   under which it is being used.  Example:  Debug Data Transfer
>   Register. When used to receive data, it should be accessed as
>   DBGDTRRX_EL0 while when transmitting data it should be
>   accessed via DBGDTRTX_EL0.
>   * Register deprecation: Some register names have been
>   deprecated and should no longer be used, but backwards-
>   compatibility requires that such names continue to be
>   recognized, as is the case for the SPSR_EL1 register, whose
>   access via the SPSR_SVC name is now deprecated.
>   * Same encoding different target: Some encodings are given
>   different meaning depending on the target architecture and, as
>   such, are given different names in each of these contexts.
>   We see an example of this for CPENC(3,4,2,0,0), which
>   corresponds to TTBR0_EL2 for Armv8-A targets and VSCTLR_EL2
>   in Armv8-R targets.
>
> A consequence of these observations is that `CPENC' duplication is
> acceptable iff at least one of the `properties' or `arch_reqs' fields
> of the `sysreg_t' structs associated with the two registers in
> question differ and it's this condition that is checked by the new
> `aarch64_test_sysreg_encoding_clashes' function.
>
> gcc/ChangeLog:
>
>   * gcc/config/aarch64/aarch64.cc
>   (aarch64_test_sysreg_encoding_clashes): New.
>   (aarch64_run_selftests): add call to
>   aarch64_test_sysreg_encoding_clashes selftest.
> ---
>  gcc/config/aarch64/aarch64.cc | 53 +++
>  1 file changed, 53 insertions(+)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d187e171beb..e0be2877ede 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22,6 +22,7 @@
>  
>  #define INCLUDE_STRING
>  #define INCLUDE_ALGORITHM
> +#define INCLUDE_VECTOR
>  #include "config.h"
>  #include "system.h"
>  #include "coretypes.h"
> @@ -28332,6 +28333,57 @@ aarch64_test_fractional_cost ()
>ASSERT_EQ (cf (1, 2).as_double (), 0.5);
>  }
>  
> +/* Calculate whether our system register data, as imported from
> +   `aarch64-sys-reg.def' has any duplicate entries.  */
> +static void
> +aarch64_test_sysreg_encoding_clashes (void)
> +{
> +  using dup_counters_t = hash_map<unsigned, unsigned>;
> +  using dup_instances_t = hash_map<unsigned,
> +				   std::vector<const sysreg_t *>>;
> +
> +  dup_counters_t duplicate_counts;
> +  dup_instances_t duplicate_instances;
> +
> +  /* Every time an encoding is established to come up more than once
> +  we add it to a "clash-analysis queue", which is then used to extract
> +  necessary information from our hash map when establishing whether
> +  repeated encodings are valid.  */

Formatting nit, sorry, but second and subsequent lines should be
indented to line up with the "E".

> +
> +  /* 1) Collect recurrence information.  */
> +  std::vector<unsigned> testqueue;
> +
> +  for (unsigned i = 0; i < nsysreg; i++)
> +{
> +  const sysreg_t *reg = sysreg_structs + i;
> +
> +  unsigned *tbl_entry = &duplicate_counts.get_or_insert (reg->encoding);
> +  *tbl_entry += 1;
> +
> +  std::vector<const sysreg_t *> *tmp
> +	= &duplicate_instances.get_or_insert (reg->encoding);
> +
> +  tmp->push_back (reg);
> +  if (*tbl_entry > 1)
> +   testqueue.push_back (reg->encoding);
> +}

Do we need two hash maps here?  It looks like the length of the vector
is always equal to the count.  Also...

> +
> +  /* 2) Carry out analysis on collected data.  */
> +  for (auto enc : testqueue)

...hash_map itself is iterable.  We could iterate over that instead,
which would avoid the need for the queue.

> +{
> +  unsigned nrep = *duplicate_counts.get (enc);
> +  for (unsigned i = 0; i < nrep; i++)
> + for (unsigned j = i+1; j < nrep; j++)

Formatting nit, but "i + 1" rather than "i+1".

Overall, it looks like really nice work.  Thanks for doing this.

Richard

> +   {
> +	  std::vector<const sysreg_t *> *tmp2 = duplicate_instances.get (enc);
> + const sysreg_t *a = (*tmp2)[i];
> + const sysreg_t *b = (*tmp2)[j];
> + ASSERT_TRUE ((a->properties != b->properties)
> +  || (a->arch_reqs != 

Re: [PATCH V2 6/7] aarch64: Add front-end argument type checking for target builtins

2023-10-18 Thread Richard Sandiford
Victor Do Nascimento  writes:
> In implementing the ACLE read/write system register builtins it was
> observed that leaving argument type checking to be done at expand-time
> meant that poorly-formed function calls were being "fixed" by certain
> optimization passes, meaning bad code wasn't being properly picked up
> in checking.
>
> Example:
>
>   const char *regname = "amcgcr_el0";
>   long long a = __builtin_aarch64_rsr64 (regname);
>
> is reduced by the ccp1 pass to
>
>   long long a = __builtin_aarch64_rsr64 ("amcgcr_el0");
>
> As these functions require an argument of STRING_CST type, there needs
> to be a check carried out by the front-end capable of picking this up.
>
> The introduced `check_general_builtin_call' function will be called by
> the TARGET_CHECK_BUILTIN_CALL hook whenever a call to a builtin
> belonging to the AARCH64_BUILTIN_GENERAL category is encountered,
> carrying out any appropriate checks associated with a particular
> builtin function code.
>
> gcc/ChangeLog:
>
>   * gcc/config/aarch64/aarch64-builtins.cc (check_general_builtin_call):
>   New.
>   * gcc/config/aarch64/aarch64-c.cc (aarch64_check_builtin_call):
>   Add check_general_builtin_call call.
>   * gcc/config/aarch64/aarch64-protos.h (check_general_builtin_call):
>   New.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c: New.
> ---
>  gcc/config/aarch64/aarch64-builtins.cc| 33 +++
>  gcc/config/aarch64/aarch64-c.cc   |  4 +--
>  gcc/config/aarch64/aarch64-protos.h   |  3 ++
>  .../gcc.target/aarch64/acle/rwsr-2.c  | 15 +
>  4 files changed, 53 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> b/gcc/config/aarch64/aarch64-builtins.cc
> index d8bb2a989a5..6734361f4f4 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -2126,6 +2126,39 @@ aarch64_general_builtin_decl (unsigned code, bool)
>return aarch64_builtin_decls[code];
>  }
>  
> +bool
> +check_general_builtin_call (location_t location, vec<location_t>,
> + unsigned int code, tree fndecl,
> + unsigned int nargs ATTRIBUTE_UNUSED, tree *args)
> +{

How about aarch64_general_check_builtin_call?  It's better to use
aarch64_* prefixes where possible.

> +  switch (code)
> +{
> +case AARCH64_RSR:
> +case AARCH64_RSRP:
> +case AARCH64_RSR64:
> +case AARCH64_RSRF:
> +case AARCH64_RSRF64:
> +case AARCH64_WSR:
> +case AARCH64_WSRP:
> +case AARCH64_WSR64:
> +case AARCH64_WSRF:
> +case AARCH64_WSRF64:
> +  if (TREE_CODE (args[0]) == VAR_DECL
> +   || TREE_CODE (TREE_TYPE (args[0])) != POINTER_TYPE
> +   || TREE_CODE (TREE_OPERAND (TREE_OPERAND (args[0], 0) , 0))
> +   != STRING_CST)

Similarly to the expand code in 5/7, I think this should check
positively for specific tree codes rather than negatively for a
VAR_DECL.  That is, we should ensure TREE_CODE (x) is something
(rather than isn't something) before accessing TREE_OPERAND (x, 0).

> + {
> +   const char  *fn_name, *err_msg;
> +   fn_name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
> +   err_msg = "first argument to %<%s%> must be a string literal";
> +   error_at (location, err_msg, fn_name);

The error message needs to remain part of the error_at call,
since being in error_at ensures that it gets picked up for translation.
It's simpler to use %qD rather than %<%s%>, and pass fndecl directly.

> +   return false;
> + }
> +}
> +  /* Default behavior.  */
> +  return true;
> +}
> +
>  typedef enum
>  {
>SIMD_ARG_COPY_TO_REG,
> diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
> index ab8844f6049..c2a9a59df73 100644
> --- a/gcc/config/aarch64/aarch64-c.cc
> +++ b/gcc/config/aarch64/aarch64-c.cc
> @@ -339,8 +339,8 @@ aarch64_check_builtin_call (location_t loc, 
> vec<location_t> arg_loc,
>switch (code & AARCH64_BUILTIN_CLASS)
>  {
>  case AARCH64_BUILTIN_GENERAL:
> -  return true;
> -
> +  return check_general_builtin_call (loc, arg_loc, subcode, orig_fndecl,
> +  nargs, args);
>  case AARCH64_BUILTIN_SVE:
>return aarch64_sve::check_builtin_call (loc, arg_loc, subcode,
> orig_fndecl, nargs, args);
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index a134e2fcf8e..9ef96ff511f 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -990,6 +990,9 @@ tree aarch64_general_builtin_rsqrt (unsigned int);
>  void handle_arm_acle_h (void);
>  void handle_arm_neon_h (void);
>  
> +bool check_general_builtin_call (location_t, vec<location_t>, unsigned int,
> +   tree, unsigned int, 

Re: [V3][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-10-18 Thread Qing Zhao


> On Oct 5, 2023, at 4:08 PM, Siddhesh Poyarekar  wrote:
> 
> On 2023-08-25 11:24, Qing Zhao wrote:
>> This is the 3rd version of the patch, per our discussion based on the
>> review comments for the 1st and 2nd version, the major changes in this
>> version are:
> 
> Hi Qing,
> 
> I hope the review was helpful.  Overall, a couple of things to consider:
> 
> 1. How would you handle potential reordering between assignment of the size 
> to the counted_by field with the __bdos call that may consume it? You'll 
> probably need to express some kind of dependency there or in the worst case, 
> insert a barrier to disallow reordering.

Good point! 

So, your example in the response to [V3][PATCH 2/3] Use the counted_by attribute
info in builtin object size [PR108896]:
“
Maybe another test where the allocation, size assignment and __bdos call happen 
in the same function, where the allocator is not recognized by gcc:

void *
__attribute__ ((noinline))
alloc (size_t sz)
{
 return __builtin_malloc (sz);
}

void test (size_t sz)
{
 array_annotated = alloc (sz);
 array_annotated->b = sz;
 return __builtin_dynamic_object_size (array_annotated->c, 1);
}

The interesting thing to test (and ensure in the codegen) is that the 
assignment to array_annotated->b does not get reordered to below the 
__builtin_dynamic_object_size call since technically there is no data 
dependency between the two.
“
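For reference, a declaration consistent with this example (hypothetical, but
matching the counted_by syntax from the patch) would be:

  #include <stddef.h>

  /* The counted_by argument names the field holding the element count
     of the flexible array member.  */
  struct annotated {
    size_t b;
    char c[] __attribute__ ((counted_by (b)));
  };
  struct annotated *array_annotated;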
Will test on this. 

Not sure whether the current GCC alias analysis is able to distinguish one
field of a structure from another field of the same structure; if yes, then
we need to add an explicit dependency edge from the write to
“array_annotated->b” to the call to
“__builtin_dynamic_object_size(array_annotated->c,1)”.
I will check on this and see how to resolve this issue.

I guess the possible solution is that we can add an implicit ref to 
“array_annotated->b” at the call to 
“__builtin_dynamic_object_size(array_annotated->c, 1)” if the counted_by 
attribute is available. That should resolve the issue.

Richard, what do you think on this?

> 
> 2. How would you handle signedness of the size field?  The size gets 
> converted to sizetype everywhere it is used and overflows/underflows may 
> produce interesting results.  Do you want to limit the types to unsigned or 
> do you want to add a disclaimer in the docs?  The former seems like the 
> *right* thing to do given that it is a new feature; best to enforce the 
> cleaner habit at the outset.

As I replied to Martin in another email, I plan to do the following to resolve 
this issue:

1. No specification for signed or unsigned for counted_by field.
2. Add a sanitizer option -fsanitize=counted-by-bound to catch the cases when 
the size of the counted-by is not positive.

Then, we will be consistent with the handling of VLA. 

So, I will not change anything for the current patch.
However, I will add the sanitizer option in a followup patch set.

Let me know your opinion.

thanks.

Qing

> 
> Thanks,
> Sid
> 
>> ***Against 1st version:
>> 1. change the name "element_count" to "counted_by";
>> 2. change the parameter for the attribute from a STRING to an
>> Identifier;
>> 3. Add logic and testing cases to handle anonymous structure/unions;
>> 4. Clarify documentation to permit the situation when the allocation
>> size is larger than what's specified by "counted_by", at the same time,
>> it's user's error if allocation size is smaller than what's specified by
>> "counted_by";
>> 5. Add a complete testing case for using counted_by attribute in
>> __builtin_dynamic_object_size when there is mismatch between the
>> allocation size and the value of "counted_by", the expecting behavior
>> for each case and the explanation on why in the comments.
>> ***Against 2rd version:
>> 1. Identify a tree node sharing issue and fixed it in the routine
>>"component_ref_get_counted_ty" of tree.cc;
>> 2. Update the documentation and testing cases with the clear usage
>>of the formula to compute the allocation size:
>> MAX (sizeof (struct A), offsetof (struct A, array[0]) + counted_by * 
>> sizeof(element))
>>(the algorithm used in tree-object-size.cc is correct).
>> In this set of patches, the major functionality provided is:
>> 1. a new attribute "counted_by";
>> 2. use this new attribute in bound sanitizer;
>> 3. use this new attribute in dynamic object size for subobject size;
>> As discussed, I plan to add two more separate patches sets after this initial
>> patch set is approved and committed.
>> set 1. A new warning option and a new sanitizer option for the user error
>>   when the allocation size is smaller than the value of "counted_by".
>> set 2. An improvement to __builtin_dynamic_object_size  for whole-object
>>   size of the structure with FAM annotated with counted_by.
>> there are also some existing bugs in tree-object-size.cc identified
>> during the study, and PRs were filed to record them. these bugs will
>> be fixed separately with individual patches:
>> 

Re: [PATCH V2 4/7] aarch64: Add basic target_print_operand support for CONST_STRING

2023-10-18 Thread Richard Sandiford
Victor Do Nascimento  writes:
> Motivated by the need to print system register names in output
> assembly, this patch adds the required logic to
> `aarch64_print_operand' to accept rtxs of type CONST_STRING and
> process these accordingly.
>
> Consequently, an rtx such as:
>
>   (set (reg/i:DI 0 x0)
>  (unspec:DI [(const_string ("s3_3_c13_c2_2"))])
>
> can now be output correctly using the following output pattern when
> composing `define_insn's:
>
>   "mrs\t%x0, %1"
>
> gcc/ChangeLog
>
>   * gcc/config/aarch64/aarch64.cc (aarch64_print_operand): Add
>   support for CONST_STRING.
> ---
>  gcc/config/aarch64/aarch64.cc | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 816c4b69fc8..d187e171beb 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -12430,6 +12430,12 @@ aarch64_print_operand (FILE *f, rtx x, int code)
>  
>switch (GET_CODE (x))
>   {
> + case CONST_STRING:
> +   {
> + const char *output_op = XSTR (x, 0);
> + asm_fprintf (f, "%s", output_op);
> + break;
> +   }

LGTM, but it seems slightly neater to avoid the temporary:

case CONST_STRING:
  asm_fprintf (f, "%s", XSTR (x, 0));
  break;

(Sorry for the micro-comment.)

Thanks,
Richard

>   case REG:
> if (aarch64_sve_data_mode_p (GET_MODE (x)))
>   {


Re: [PATCH V2 2/7] aarch64: Add support for aarch64-sys-regs.def

2023-10-18 Thread Richard Sandiford
Victor Do Nascimento  writes:
> This patch defines the structure of a new .def file used for
> representing the aarch64 system registers, what information it should
> hold and the basic framework in GCC to process this file.
>
> Entries in the aarch64-system-regs.def file should be as follows:
>
>   SYSREG (NAME, CPENC (sn,op1,cn,cm,op2), FLAG1 | ... | FLAGn, ARCH)
>
> Where the arguments to SYSREG correspond to:
>   - NAME:  The system register name, as used in the assembly language.
>   - CPENC: The system register encoding, mapping to:
>
>  s<sn>_<op1>_c<cn>_c<cm>_<op2>
>
>   - FLAG: The entries in the FLAGS field are bitwise-OR'd together to
> encode extra information required to ensure proper use of
> the system register.  For example, a read-only system
> register will have the flag F_REG_READ, while write-only
> registers will be labeled F_REG_WRITE.  Such flags are
> tested against at compile-time.
>   - ARCH: The architectural features the system register is associated
> with.  This is encoded via one of three possible macros:
> 1. When a system register is universally implemented, we say
> it has no feature requirements, so we tag it with the
> AARCH64_NO_FEATURES macro.
> 2. When a register is only implemented for a single
> architectural extension EXT, the AARCH64_FEATURE (EXT) macro is
> used.
> 3. When a given system register is made available by any of N
> possible architectural extensions, the AARCH64_FEATURES(N, ...)
> macro is used to combine them accordingly.
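For illustration, a made-up entry combining the pieces described above
might look like this (the name, encoding and feature list here are
invented, not taken from the real .def file):

  SYSREG ("dummy_el1", CPENC (3,0,4,2,1), F_REG_READ | F_REG_WRITE,
          AARCH64_FEATURES (2, RAS, PAN))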
>
> In order to enable proper interpretation of the SYSREG entries by the
> compiler, flags defining system register behavior such as `F_REG_READ'
> and `F_REG_WRITE' are also defined here, so they can later be used for
> the validation of system register properties.
>
> Finally, any architectural feature flags from Binutils missing from GCC
> have appropriate aliases defined here so as to ensure
> cross-compatibility of SYSREG entries across the toolchain.
>
> gcc/ChangeLog:
>
>   * gcc/config/aarch64/aarch64.cc (sysreg_t): New.
>   (sysreg_structs): Likewise.
>   (nsysreg): Likewise.
>   (AARCH64_FEATURE): Likewise.
>   (AARCH64_FEATURES): Likewise.
>   (AARCH64_NO_FEATURES): Likewise.
>   * gcc/config/aarch64/aarch64.h (AARCH64_ISA_V8A): Add missing
>   ISA flag.
>   (AARCH64_ISA_V8_1A): Likewise.
>   (AARCH64_ISA_V8_7A): Likewise.
>   (AARCH64_ISA_V8_8A): Likewise.
>   (AARCH64_NO_FEATURES): Likewise.
>   (AARCH64_FL_RAS): New ISA flag alias.
>   (AARCH64_FL_LOR): Likewise.
>   (AARCH64_FL_PAN): Likewise.
>   (AARCH64_FL_AMU): Likewise.
>   (AARCH64_FL_SCXTNUM): Likewise.
>   (AARCH64_FL_ID_PFR2): Likewise.
>   (F_DEPRECATED): New.
>   (F_REG_READ): Likewise.
>   (F_REG_WRITE): Likewise.
>   (F_ARCHEXT): Likewise.
>   (F_REG_ALIAS): Likewise.
> ---
>  gcc/config/aarch64/aarch64.cc | 38 +++
>  gcc/config/aarch64/aarch64.h  | 36 +
>  2 files changed, 74 insertions(+)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 9fbfc548a89..69de2366424 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -2807,6 +2807,44 @@ static const struct processor all_cores[] =
>{NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
>  };
>  
> +typedef struct {
> +  const char* name;
> +  const char* encoding;

Formatting nit, but GCC style is:

  const char *foo;

rather than:

  const char* foo;

> +  const unsigned properties;
> +  const unsigned long long arch_reqs;

I don't think these two should be const.  There's no reason in principle
why a sysreg_t can't be created and modified dynamically.

It would be useful to have some comments above the fields to say what
they represent.  E.g. the definition on its own doesn't make clear what
"properties" refers to.

arch_reqs should use aarch64_feature_flags rather than unsigned long long.
We're running out of feature flags in GCC too, so aarch64_feature_flags
is soon likely to be a C++ class.
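Something along these lines, say (the comment wording and exact field
layout are just a sketch):

  typedef struct {
    /* Name of the system register, as used in the assembly syntax.  */
    const char *name;
    /* The "s<op0>_<op1>_c<cn>_c<cm>_<op2>"-style encoding string.  */
    const char *encoding;
    /* F_REG_* flags describing how the register may be used.  */
    unsigned properties;
    /* Architectural features that gate use of the register.  */
    aarch64_feature_flags arch_reqs;
  } sysreg_t;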

> +} sysreg_t;
> +
> +/* An aarch64_feature_set initializer for a single feature,
> +   AARCH64_FEATURE_.  */
> +#define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
> +
> +/* Used by AARCH64_FEATURES.  */
> +#define AARCH64_OR_FEATURES_1(X, F1) \
> +  AARCH64_FEATURE (F1)
> +#define AARCH64_OR_FEATURES_2(X, F1, F2) \
> +  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
> +#define AARCH64_OR_FEATURES_3(X, F1, ...) \
> +  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
> +
> +/* An aarch64_feature_set initializer for the N features listed in "...".  */
> +#define AARCH64_FEATURES(N, ...) \
> +  AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
> +
> +/* Database of system registers, their encodings and architectural
> +   requirements.  */
> +const sysreg_t 

Re: [PATCH] libcpp: testsuite: Add test for fixed _Pragma bug [PR82335]

2023-10-18 Thread Lewis Hyatt
May I please ping this one, and/or, is it something straightforward
enough I can just commit it as obvious? Thanks!
https://gcc.gnu.org/pipermail/gcc-patches/2023-October/631814.html

-Lewis

On Mon, Oct 2, 2023 at 6:23 PM Lewis Hyatt  wrote:
>
> Hello-
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82335 is another
> _Pragma-related bug that got fixed in GCC 12 but is still open. Before
> closing it out, I thought it would be good to add the testcase from that
> PR, which we don't have exactly in the testsuite already. Is it OK please?
> Thanks!
>
> -Lewis
>
> -- >8 --
>
> This PR was fixed by r12-4797 and r12-5454. Add test coverage from the PR
> that is not represented elsewhere.
>
> gcc/testsuite/ChangeLog:
>
> PR preprocessor/82335
> * c-c++-common/cpp/diagnostic-pragma-3.c: New test.
> ---
>  .../c-c++-common/cpp/diagnostic-pragma-3.c| 37 +++
>  1 file changed, 37 insertions(+)
>  create mode 100644 gcc/testsuite/c-c++-common/cpp/diagnostic-pragma-3.c
>
> diff --git a/gcc/testsuite/c-c++-common/cpp/diagnostic-pragma-3.c 
> b/gcc/testsuite/c-c++-common/cpp/diagnostic-pragma-3.c
> new file mode 100644
> index 000..459dcec73b3
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/cpp/diagnostic-pragma-3.c
> @@ -0,0 +1,37 @@
> +/* This is like diagnostic-pragma-2.c, but handles the case where everything
> +   is wrapped inside a macro, which previously caused additional issues 
> tracked
> +   in PR preprocessor/82335.  */
> +
> +/* { dg-do compile } */
> +/* { dg-additional-options "-save-temps -Wattributes -Wtype-limits" } */
> +
> +#define B _Pragma("GCC diagnostic push") \
> +  _Pragma("GCC diagnostic ignored \"-Wattributes\"")
> +#define E _Pragma("GCC diagnostic pop")
> +
> +#define X() B int __attribute((unknown_attr)) x; E
> +#define Y   B int __attribute((unknown_attr)) y; E
> +#define WRAP(x) x
> +
> +void test1(void)
> +{
> +  WRAP(X())
> +  WRAP(Y)
> +}
> +
> +/* Additional test provided on the PR.  */
> +#define PRAGMA(...) _Pragma(#__VA_ARGS__)
> +#define PUSH_IGN(X) PRAGMA(GCC diagnostic push) PRAGMA(GCC diagnostic 
> ignored X)
> +#define POP() PRAGMA(GCC diagnostic pop)
> +#define TEST(X, Y) \
> +  PUSH_IGN("-Wtype-limits") \
> +  int Y = (__typeof(X))-1 < 0; \
> +  POP()
> +
> +int test2()
> +{
> +  unsigned x;
> +  TEST(x, i1);
> +  WRAP(TEST(x, i2))
> +  return i1 + i2;
> +}


Re: [V3][PATCH 2/3] Use the counted_by attribute info in builtin object size [PR108896]

2023-10-18 Thread Qing Zhao
Hi, Sid,

Thanks a lot for the detailed comments.

See my responds embedded below.

Qing

> On Oct 5, 2023, at 4:01 PM, Siddhesh Poyarekar  wrote:
> 
> 
> 
> On 2023-08-25 11:24, Qing Zhao wrote:
>> Use the counted_by attribute info in builtin object size to compute the
>> subobject size for flexible array members.
>> gcc/ChangeLog:
>>  PR C/108896
>>  * tree-object-size.cc (addr_object_size): Use the counted_by
>>  attribute info.
>>  * tree.cc (component_ref_has_counted_by_p): New function.
>>  (component_ref_get_counted_by): New function.
>>  * tree.h (component_ref_has_counted_by_p): New prototype.
>>  (component_ref_get_counted_by): New prototype.
>> gcc/testsuite/ChangeLog:
>>  PR C/108896
>>  * gcc.dg/flex-array-counted-by-2.c: New test.
>>  * gcc.dg/flex-array-counted-by-3.c: New test.
>> ---
>>  .../gcc.dg/flex-array-counted-by-2.c  |  74 ++
>>  .../gcc.dg/flex-array-counted-by-3.c  | 210 ++
>>  gcc/tree-object-size.cc   |  37 ++-
>>  gcc/tree.cc   |  95 +++-
>>  gcc/tree.h|  10 +
>>  5 files changed, 418 insertions(+), 8 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by-2.c
>>  create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by-3.c
>> diff --git a/gcc/testsuite/gcc.dg/flex-array-counted-by-2.c 
>> b/gcc/testsuite/gcc.dg/flex-array-counted-by-2.c
>> new file mode 100644
>> index ..ec580c1f1f01
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/flex-array-counted-by-2.c
>> @@ -0,0 +1,74 @@
>> +/* test the attribute counted_by and its usage in
>> + * __builtin_dynamic_object_size.  */
>> +/* { dg-do run } */
>> +/* { dg-options "-O2" } */
>> +
>> +#include "builtin-object-size-common.h"
>> +
>> +#define expect(p, _v) do { \
>> +size_t v = _v; \
>> +if (p == v) \
>> +__builtin_printf ("ok:  %s == %zd\n", #p, p); \
>> +else \
>> +{  \
>> +  __builtin_printf ("WAT: %s == %zd (expected %zd)\n", #p, p, v); \
>> +  FAIL (); \
>> +} \
>> +} while (0);
> 
> You're using this in a bunch of tests already; does it make sense to 
> consolidate it into builtin-object-size-common.h?
Will do this. 
> 
>> +
>> +struct flex {
>> +  int b;
>> +  int c[];
>> +} *array_flex;
>> +
>> +struct annotated {
>> +  int b;
>> +  int c[] __attribute__ ((counted_by (b)));
>> +} *array_annotated;
>> +
>> +struct nested_annotated {
>> +  struct {
>> +union {
>> +  int b;
>> +  float f;  
>> +};
>> +int n;
>> +  };
>> +  int c[] __attribute__ ((counted_by (b)));
>> +} *array_nested_annotated;
>> +
>> +void __attribute__((__noinline__)) setup (int normal_count, int attr_count)
>> +{
>> +  array_flex
>> += (struct flex *)malloc (sizeof (struct flex)
>> + + normal_count *  sizeof (int));
>> +  array_flex->b = normal_count;
>> +
>> +  array_annotated
>> += (struct annotated *)malloc (sizeof (struct annotated)
>> +  + attr_count *  sizeof (int));
>> +  array_annotated->b = attr_count;
>> +
>> +  array_nested_annotated
>> += (struct nested_annotated *)malloc (sizeof (struct nested_annotated)
>> + + attr_count *  sizeof (int));
>> +  array_nested_annotated->b = attr_count;
>> +
>> +  return;
>> +}
>> +
>> +void __attribute__((__noinline__)) test ()
>> +{
>> +expect(__builtin_dynamic_object_size(array_flex->c, 1), -1);
>> +expect(__builtin_dynamic_object_size(array_annotated->c, 1),
>> +   array_annotated->b * sizeof (int));
>> +expect(__builtin_dynamic_object_size(array_nested_annotated->c, 1),
>> +   array_nested_annotated->b * sizeof (int));
>> +}
> 
> Maybe another test where the allocation, size assignment and __bdos call 
> happen in the same function, where the allocator is not recognized by gcc:
> 
> void *
> __attribute__ ((noinline))
> alloc (size_t sz)
> {
>  return __builtin_malloc (sz);
> }
> 
> size_t test (size_t sz)
> {
>  array_annotated = alloc (sz);
>  array_annotated->b = sz;
>  return __builtin_dynamic_object_size (array_annotated->c, 1);
> }
> 
> The interesting thing to test (and ensure in the codegen) is that the 
> assignment to array_annotated->b does not get reordered to below the 
> __builtin_dynamic_object_size call since technically there is no data 
> dependency between the two.
Good point.
Will add such a test case.
> 
>> +
>> +int main(int argc, char *argv[])
>> +{
>> +  setup (10,10);
>> +  test ();
>> +  DONE ();
>> +}
>> diff --git a/gcc/testsuite/gcc.dg/flex-array-counted-by-3.c 
>> b/gcc/testsuite/gcc.dg/flex-array-counted-by-3.c
>> new file mode 100644
>> index ..a0c3cb88ec71
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/flex-array-counted-by-3.c
>> @@ -0,0 +1,210 @@
>> +/* test the attribute counted_by and its usage in
>> +__builtin_dynamic_object_size: what's the correct behavior when the
>> 

Re: [PATCH V2 5/7] aarch64: Implement system register r/w arm ACLE intrinsic functions

2023-10-18 Thread Richard Sandiford
Victor Do Nascimento  writes:
> Implement the aarch64 intrinsics for reading and writing system
> registers with the following signatures:
>
>   uint32_t __arm_rsr(const char *special_register);
>   uint64_t __arm_rsr64(const char *special_register);
>   void* __arm_rsrp(const char *special_register);
>   float __arm_rsrf(const char *special_register);
>   double __arm_rsrf64(const char *special_register);
>   void __arm_wsr(const char *special_register, uint32_t value);
>   void __arm_wsr64(const char *special_register, uint64_t value);
>   void __arm_wsrp(const char *special_register, const void *value);
>   void __arm_wsrf(const char *special_register, float value);
>   void __arm_wsrf64(const char *special_register, double value);
>
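As a usage sketch, with a register name chosen purely for illustration
and assuming the arm_acle.h additions from this patch:

  #include <arm_acle.h>

  /* Read the current value of TPIDR_EL0.  */
  unsigned long long
  get_tpidr (void)
  {
    return __arm_rsr64 ("tpidr_el0");
  }

  /* Write a new value to TPIDR_EL0.  */
  void
  set_tpidr (unsigned long long val)
  {
    __arm_wsr64 ("tpidr_el0", val);
  }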
> gcc/ChangeLog:
>
>   * gcc/config/aarch64/aarch64-builtins.cc (enum aarch64_builtins):
>   Add enums for new builtins.
>   (aarch64_init_rwsr_builtins): New.
>   (aarch64_general_init_builtins): Call aarch64_init_rwsr_builtins.
>   (aarch64_expand_rwsr_builtin):  New.
>   (aarch64_general_expand_builtin): Call aarch64_expand_rwsr_builtin.
>   * gcc/config/aarch64/aarch64.md (read_sysregdi): New insn_and_split.
>   (write_sysregdi): Likewise.
>   * gcc/config/aarch64/arm_acle.h (__arm_rsr): New.
>   (__arm_rsrp): Likewise.
>   (__arm_rsr64): Likewise.
>   (__arm_rsrf): Likewise.
>   (__arm_rsrf64): Likewise.
>   (__arm_wsr): Likewise.
>   (__arm_wsrp): Likewise.
>   (__arm_wsr64): Likewise.
>   (__arm_wsrf): Likewise.
>   (__arm_wsrf64): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc/testsuite/gcc.target/aarch64/acle/rwsr.c: New.
>   * gcc/testsuite/gcc.target/aarch64/acle/rwsr-1.c: Likewise.
> ---
>  gcc/config/aarch64/aarch64-builtins.cc| 200 ++
>  gcc/config/aarch64/aarch64.md |  17 ++
>  gcc/config/aarch64/arm_acle.h |  30 +++
>  .../gcc.target/aarch64/acle/rwsr-1.c  |  20 ++
>  gcc/testsuite/gcc.target/aarch64/acle/rwsr.c  | 144 +
>  5 files changed, 411 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr.c
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> b/gcc/config/aarch64/aarch64-builtins.cc
> index 04f59fd9a54..d8bb2a989a5 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -808,6 +808,17 @@ enum aarch64_builtins
>AARCH64_RBIT,
>AARCH64_RBITL,
>AARCH64_RBITLL,
> +  /* System register builtins.  */
> +  AARCH64_RSR,
> +  AARCH64_RSRP,
> +  AARCH64_RSR64,
> +  AARCH64_RSRF,
> +  AARCH64_RSRF64,
> +  AARCH64_WSR,
> +  AARCH64_WSRP,
> +  AARCH64_WSR64,
> +  AARCH64_WSRF,
> +  AARCH64_WSRF64,
>AARCH64_BUILTIN_MAX
>  };
>  
> @@ -1798,6 +1809,65 @@ aarch64_init_rng_builtins (void)
>  AARCH64_BUILTIN_RNG_RNDRRS);
>  }
>  
> +/* Add builtins for reading system register.  */
> +static void
> +aarch64_init_rwsr_builtins (void)
> +{
> +  tree fntype = NULL;
> +  tree const_char_ptr_type
> += build_pointer_type (build_type_variant (char_type_node, true, false));
> +
> +#define AARCH64_INIT_RWSR_BUILTINS_DECL(F, N, T) \
> +  aarch64_builtin_decls[AARCH64_##F] \
> += aarch64_general_add_builtin ("__builtin_aarch64_"#N, T, AARCH64_##F);
> +
> +  fntype
> += build_function_type_list (uint32_type_node, const_char_ptr_type, NULL);
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (RSR, rsr, fntype);
> +
> +  fntype
> += build_function_type_list (ptr_type_node, const_char_ptr_type, NULL);
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (RSRP, rsrp, fntype);
> +
> +  fntype
> += build_function_type_list (uint64_type_node, const_char_ptr_type, NULL);
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (RSR64, rsr64, fntype);
> +
> +  fntype
> += build_function_type_list (float_type_node, const_char_ptr_type, NULL);
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (RSRF, rsrf, fntype);
> +
> +  fntype
> += build_function_type_list (double_type_node, const_char_ptr_type, NULL);
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (RSRF64, rsrf64, fntype);
> +
> +  fntype
> += build_function_type_list (void_type_node, const_char_ptr_type,
> + uint32_type_node, NULL);
> +
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (WSR, wsr, fntype);
> +
> +  fntype
> += build_function_type_list (void_type_node, const_char_ptr_type,
> + const_ptr_type_node, NULL);
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (WSRP, wsrp, fntype);
> +
> +  fntype
> += build_function_type_list (void_type_node, const_char_ptr_type,
> + uint64_type_node, NULL);
> +  AARCH64_INIT_RWSR_BUILTINS_DECL (WSR64, wsr64, fntype);
> +
> +  fntype
> += build_function_type_list (void_type_node, const_char_ptr_type,
> + float_type_node, NULL);
> +  

Re: [PATCH] RISC-V: Add popcount fallback expander.

2023-10-18 Thread Robin Dapp
> Could you by the way add this mention this PR: 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111791
> Add the test of this PR ?

Commented in that PR.  This patch does not help there.

Regards
 Robin


Re: [PATCH v2] gcc: Introduce -fhardened

2023-10-18 Thread Qing Zhao
Marek,

Sorry for the late comment (I was just back from a long vacation immediately
after Cauldron).

One question:

Is the option “-fhardened” for production builds or for development builds?

If it’s for development builds, then adding -ftrivial-auto-var-init=pattern is 
reasonable, since the major purpose of -ftrivial-auto-var-init=pattern is 
debugging, and its runtime overhead is higher than that of 
-ftrivial-auto-var-init=zero.

However, if it’s for production builds, then adding -ftrivial-auto-var-init=zero 
is better, since its major purpose is to eliminate uninitialized memory in 
production builds, and the runtime overhead of =zero is smaller than that of 
=pattern.

Qing
> On Oct 11, 2023, at 4:48 PM, Marek Polacek  wrote:
> 
> On Tue, Sep 19, 2023 at 10:58:19AM -0400, Marek Polacek wrote:
>> On Mon, Sep 18, 2023 at 08:57:39AM +0200, Richard Biener wrote:
>>> On Fri, Sep 15, 2023 at 5:09 PM Marek Polacek via Gcc-patches
>>>  wrote:
 
 Bootstrapped/regtested on x86_64-pc-linux-gnu, 
 powerpc64le-unknown-linux-gnu,
 and aarch64-unknown-linux-gnu; ok for trunk?
 
 -- >8 --
 In 
 I proposed -fhardened, a new umbrella option that enables a reasonable set
 of hardening flags.  The read of the room seems to be that the option
 would be useful.  So here's a patch implementing that option.
 
 Currently, -fhardened enables:
 
  -D_FORTIFY_SOURCE=3 (or =2 for older glibcs)
  -D_GLIBCXX_ASSERTIONS
  -ftrivial-auto-var-init=pattern
  -fPIE  -pie  -Wl,-z,relro,-z,now
  -fstack-protector-strong
  -fstack-clash-protection
  -fcf-protection=full (x86 GNU/Linux only)
 
 -fhardened will not override options that were specified on the command 
 line
 (before or after -fhardened).  For example,
 
 -D_FORTIFY_SOURCE=1 -fhardened
 
 means that _FORTIFY_SOURCE=1 will be used.  Similarly,
 
  -fhardened -fstack-protector
 
 will not enable -fstack-protector-strong.
 
 In DW_AT_producer it is reflected only as -fhardened; it doesn't expand
 to anything.  I think we need a better way to show what it actually
 enables.
>>> 
>>> I do think we need to find a solution here to solve asserting compliance.
>> 
>> Fair enough.
>> 
>>> Maybe we can have -Whardened that will diagnose any altering of
>>> -fhardened by other options on the command-line or by missed target
>>> implementations?  People might for example use -fstack-protector
>>> but don't really want to make protection lower than requested with 
>>> -fhardened.
>>> 
>>> Any such conflict is much less appearant than when you use the
>>> flags -fhardened composes.
>> 
>> How about: --help=hardened says which options -fhardened attempts to
>> enable, and -Whardened warns when it didn't enable an option?  E.g.,
>> 
>>  -fstack-protector -fhardened -Whardened
>> 
>> would say that it didn't enable -fstack-protector-strong because
>> -fstack-protector was specified on the command line?
>> 
>> If !HAVE_LD_NOW_SUPPORT, --help=hardened probably doesn't even have to
>> list -z now, likewise for -z relro.
>> 
>> Unclear if -Whardened should be enabled by default, but probably yes?
> 
> Here's v2 which adds -Whardened (enabled by default).
> 
> Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?
> 
> -- >8 --
> In 
> I proposed -fhardened, a new umbrella option that enables a reasonable set
> of hardening flags.  The read of the room seems to be that the option
> would be useful.  So here's a patch implementing that option.
> 
> Currently, -fhardened enables:
> 
>  -D_FORTIFY_SOURCE=3 (or =2 for older glibcs)
>  -D_GLIBCXX_ASSERTIONS
>  -ftrivial-auto-var-init=pattern
>  -fPIE  -pie  -Wl,-z,relro,-z,now
>  -fstack-protector-strong
>  -fstack-clash-protection
>  -fcf-protection=full (x86 GNU/Linux only)
> 
> -fhardened will not override options that were specified on the command line
> (before or after -fhardened).  For example,
> 
> -D_FORTIFY_SOURCE=1 -fhardened
> 
> means that _FORTIFY_SOURCE=1 will be used.  Similarly,
> 
>  -fhardened -fstack-protector
> 
> will not enable -fstack-protector-strong.
> 
> In DW_AT_producer it is reflected only as -fhardened; it doesn't expand
> to anything.  This patch provides -Whardened, enabled by default, which
> warns when -fhardened couldn't enable a particular option.  I think most
> often it will say that _FORTIFY_SOURCE wasn't enabled because optimizations
> were not enabled.
> 
> gcc/c-family/ChangeLog:
> 
>   * c-opts.cc (c_finish_options): Maybe cpp_define _FORTIFY_SOURCE
>   and _GLIBCXX_ASSERTIONS.
> 
> gcc/ChangeLog:
> 
>   * common.opt (Whardened, fhardened): New options.
>   * config.in: Regenerate.
>   * config/bpf/bpf.cc: Include "opts.h".
>   

Re: [V3][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-10-18 Thread Qing Zhao


> On Oct 6, 2023, at 4:01 PM, Martin Uecker  wrote:
> 
> Am Freitag, dem 06.10.2023 um 06:50 -0400 schrieb Siddhesh Poyarekar:
>> On 2023-10-06 01:11, Martin Uecker wrote:
>>> Am Donnerstag, dem 05.10.2023 um 15:35 -0700 schrieb Kees Cook:
 On Thu, Oct 05, 2023 at 04:08:52PM -0400, Siddhesh Poyarekar wrote:
> 2. How would you handle signedness of the size field?  The size gets
> converted to sizetype everywhere it is used and overflows/underflows may
> produce interesting results.  Do you want to limit the types to unsigned 
> or
> do you want to add a disclaimer in the docs?  The former seems like the
> *right* thing to do given that it is a new feature; best to enforce the
> cleaner habit at the outset.
 
 The Linux kernel has a lot of "int" counters, so the goal is to catch
 negative offsets just like too-large offsets at runtime with the sanitizer
 and report 0 for __bdos. Refactoring all these to be unsigned is going
 to take time since at least some of them use the negative values as
 special values unrelated to array indexing. :(
 
 So, perhaps if unsigned counters are worth enforcing, can this be a
 separate warning the kernel can turn off initially?
 
>>> 
>>> I think unsigned counters are much more problematic than signed ones
>>> because wraparound errors are more difficult to find.
>>> 
>>> With unsigned you could potentially diagnose wraparound, but only if we
>>> add -fsanitize=unsigned-overflow *and* add mechanism to mark intentional
>>> wraparound *and* everybody adds this annotation after carefully screening
>>> their code *and* rewriting all operations such as (counter - 3) + 5
>>> where the wraparound in the intermediate expression is harmless.
>>> 
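(As a concrete example of that last point, assuming 32-bit unsigned
arithmetic:

  unsigned counter = 1;
  unsigned idx = (counter - 3) + 5;  /* intermediate wraps to 0xfffffffe,
                                        but idx is the intended 3 */

the intermediate wraparound is harmless, yet a naive
-fsanitize=unsigned-overflow would flag it.)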
>>> For this reason, I do not think we should ever enforce some rule that
>>> the counter has to be unsigned.
>>> 
>>> What we could do, is detect *storing* negative values into the
>>> counter at run-time using UBSan. (but if negative values are
>>> used for special cases, one also should be able to turn this
>>> off).
>> 
>> All of the object size detection relies on object sizes being sizetype. 
>> The closest we could do with that is detect (sz != SIZE_MAX && sz > 
>> SIZE_MAX / 2), since allocators typically cannot allocate more than 
>> SIZE_MAX / 2.
> 
> I was talking about the counter in:
> 
> struct {
>  int counter;
>  char buf[] __counted_by__((counter))
> };
> 
> which could be checked to be positive either when stored to or 
> when buf is used.
> 
> And yes, we could also check the size of buf.  Not sure what is
> done for VLAs now, but I guess it could be similar.
> 
For VLAs, the bound expression can be either signed or unsigned,
but we have added a sanitizer option, -fsanitize=vla-bound, to catch the cases
where the size of the VLA is not positive.

For example:

[opc@qinzhao-ol8u3-x86 Martin]$ cat t3.c
#include <stdio.h>
size_t foo(int m)
{
  char t[m];

  return sizeof(t);
}

int main()
{
  printf ("the sizeof flexm is %lu \n", foo(-1));
  return 0;
}
[opc@qinzhao-ol8u3-x86 Martin]$ sh t
/home/opc/Install/latest-d/bin/gcc -fsanitize=undefined -O2 -Wall -Wpedantic 
t3.c
t3.c:4:8: runtime error: variable length array bound evaluates to non-positive 
value -1
the sizeof flexm is 18446744073609551616 


We can do the same thing for “counted_by”, i.e.:

1. No signedness requirement for the counted_by field.
2. Add a sanitizer option -fsanitize=counted-by-bound to catch the cases when 
the value of the counted_by field is not positive.
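A sketch of what the proposed check would apply to (the option itself
does not exist yet; attribute spelling as in this patch set):

  struct annotated {
    int count;  /* the counter may stay signed */
    char buf[] __attribute__ ((counted_by (count)));
  };

With -fsanitize=counted-by-bound, storing a non-positive value to count
(or using buf with such a count) would be diagnosed at run time,
analogous to the VLA diagnostic shown above.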

Is this good enough?

Qing
> Best,
> Martin
> 
> 
> 
>> 
>> Sid



Re: [PATCH V2 3/7] aarch64: Implement system register validation tools

2023-10-18 Thread Richard Sandiford
Generally looks really good.  Some comments below.

Victor Do Nascimento  writes:
> Given the implementation of a mechanism of encoding system registers
> into GCC, this patch provides the mechanism of validating their use by
> the compiler.  In particular, this involves:
>
>   1. Ensuring a supplied string corresponds to a known system
>  register name.  System registers can be accessed either via their
>  name (e.g. `SPSR_EL1') or their encoding (e.g. `S3_0_C4_C0_0').
>  Register names are validated using a hash map, mapping known
>  system register names to its corresponding `sysreg_t' struct,
>  which is populated from the `aarch64_system_regs.def' file.
>  Register name validation is done via `lookup_sysreg_map', while
>  the encoding naming convention is validated via a parser
>  implemented in this patch - `is_implem_def_reg'.
>   2. Once a given register name is deemed to be valid, it is checked
>  against two further criteria:
>a. Is the referenced register implemented in the target
>   architecture?  This is achieved by comparing the ARCH field
> in the relevant SYSREG entry from `aarch64_system_regs.def'
> against `aarch64_feature_flags' flags set at compile-time.
>b. Is the register being used correctly?  Check the requested
> operation against the FLAGS specified in SYSREG.
> This prevents operations like writing to a read-only system
> register.
>
> gcc/ChangeLog:
>
>   * gcc/config/aarch64/aarch64-protos.h (aarch64_valid_sysreg_name_p): 
> New.
>   (aarch64_retrieve_sysreg): Likewise.
>   * gcc/config/aarch64/aarch64.cc (is_implem_def_reg): Likewise.
>   (aarch64_valid_sysreg_name_p): Likewise.
>   (aarch64_retrieve_sysreg): Likewise.
>   (aarch64_register_sysreg): Likewise.
>   (aarch64_init_sysregs): Likewise.
>   (aarch64_lookup_sysreg_map): Likewise.
>   * gcc/config/aarch64/predicates.md (aarch64_sysreg_string): New.
> ---
>  gcc/config/aarch64/aarch64-protos.h |   2 +
>  gcc/config/aarch64/aarch64.cc   | 146 
>  gcc/config/aarch64/predicates.md|   4 +
>  3 files changed, 152 insertions(+)
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 60a55f4bc19..a134e2fcf8e 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -830,6 +830,8 @@ bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
>  bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
>  bool aarch64_simd_valid_immediate (rtx, struct simd_immediate_info *,
>   enum simd_immediate_check w = AARCH64_CHECK_MOV);
> +bool aarch64_valid_sysreg_name_p (const char *);
> +const char *aarch64_retrieve_sysreg (char *, bool);
>  rtx aarch64_check_zero_based_sve_index_immediate (rtx);
>  bool aarch64_sve_index_immediate_p (rtx);
>  bool aarch64_sve_arith_immediate_p (machine_mode, rtx, bool);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 69de2366424..816c4b69fc8 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -85,6 +85,7 @@
>  #include "config/arm/aarch-common.h"
>  #include "config/arm/aarch-common-protos.h"
>  #include "ssa.h"
> +#include "hash-map.h"
>  
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -2845,6 +2846,52 @@ const sysreg_t sysreg_structs[] =
>  const unsigned nsysreg = TOTAL_ITEMS;
>  #undef TOTAL_ITEMS
>  
> +using sysreg_map_t = hash_map;
> +static sysreg_map_t *sysreg_map = nullptr;

One concern with static, non-GTY, runtime-initialised data is "does it
work with PCH?".  I suspect it does, since all uses of the map go through
aarch64_lookup_sysreg_map, and since nothing seems to rely on persistent
pointer values.  But it would be good to have a PCH test just to make sure.

I'm thinking of something like the tests in gcc/testsuite/gcc.dg/pch.
The header file (.hs) would define a function that does sysreg reads
and writes.  When the .hs is included from the .c file, the reads and
writes would be imported through a PCH load, rather than through the
normal frontend route.
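A rough shape for such a test, assuming the __arm_rsr64/__arm_wsr64
intrinsics from patch 5/7 (file names and register choice illustrative):

  /* sysreg-pch.hs */
  static inline unsigned long long
  bump_tpidr (unsigned long long val)
  {
    unsigned long long old = __arm_rsr64 ("tpidr_el0");
    __arm_wsr64 ("tpidr_el0", val);
    return old;
  }

  /* sysreg-pch.c */
  #include "sysreg-pch.hs"

  int
  main (void)
  {
    bump_tpidr (42);
    return 0;
  }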

> +
> +/* Map system register names to their hardware metadata: Encoding,

s/Encoding/encoding/

> +   feature flags and architectural feature requirements, all of which
> +   are encoded in a sysreg_t struct.  */
> +void
> +aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
> +{
> +  bool dup = sysreg_map->put (name, metadata);
> +  gcc_checking_assert (!dup);
> +}
> +
> +/* Lazily initialize hash table for system register validation,
> +   checking the validity of supplied register name and returning
> +   register's associated metadata.  */
> +static void
> +aarch64_init_sysregs (void)
> +{
> +  gcc_assert (!sysreg_map);
> +  sysreg_map = new sysreg_map_t;
> +  gcc_assert (sysreg_map);

This assert seems redundant.  

Re: PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-18 Thread Prathamesh Kulkarni
On Wed, 18 Oct 2023 at 23:22, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 17 Oct 2023 at 02:40, Richard Sandiford
> >  wrote:
> >> Prathamesh Kulkarni  writes:
> >> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> >> > index 4f8561509ff..55a6a68c16c 100644
> >> > --- a/gcc/fold-const.cc
> >> > +++ b/gcc/fold-const.cc
> >> > @@ -10684,9 +10684,8 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> >> > tree arg1,
> >> >
> >> >/* Ensure that the stepped sequence always selects from the same
> >> >input pattern.  */
> >> > -  unsigned arg_npatterns
> >> > - = ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> >> > -   : VECTOR_CST_NPATTERNS (arg1);
> >> > +  tree arg = ((q1 & 1) == 0) ? arg0 : arg1;
> >> > +  unsigned arg_npatterns = VECTOR_CST_NPATTERNS (arg);
> >> >
> >> >if (!multiple_p (step, arg_npatterns))
> >> >   {
> >> > @@ -10694,6 +10693,29 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> >> > tree arg1,
> >> >   *reason = "step is not multiple of npatterns";
> >> > return false;
> >> >   }
> >> > +
> >> > +  /* If a1 chooses base element from arg, ensure that it's a natural
> >> > +  stepped sequence, ie, (arg[2] - arg[1]) == (arg[1] - arg[0])
> >> > +  to preserve arg's encoding.  */
> >> > +
> >> > +  unsigned HOST_WIDE_INT index;
> >> > +  if (!r1.is_constant ())
> >> > + return false;
> >> > +  if (index < arg_npatterns)
> >> > + {
> >>
> >> I don't know whether it matters in practice, but I think the two conditions
> >> above are more natural as:
> >>
> >> if (maybe_lt (r1, arg_npatterns))
> >>   {
> >> unsigned HOST_WIDE_INT index;
> >> if (!r1.is_constant ())
> >>   return false;
> >>
> >> ...[code below]...
> >>   }
> >>
> >> > +   tree arg_elem0 = vector_cst_elt (arg, index);
> >> > +   tree arg_elem1 = vector_cst_elt (arg, index + arg_npatterns);
> >> > +   tree arg_elem2 = vector_cst_elt (arg, index + arg_npatterns * 2);
> >> > +
> >> > +   if (!operand_equal_p (const_binop (MINUS_EXPR, arg_elem2, 
> >> > arg_elem1),
> >> > + const_binop (MINUS_EXPR, arg_elem1, 
> >> > arg_elem0),
> >> > + 0))
> >>
> >> This needs to check whether const_binop returns null.  Maybe:
> >>
> >>tree step1, step2;
> >>if (!(step1 = const_binop (MINUS_EXPR, arg_elem1, arg_elem0))
> >>|| !(step2 = const_binop (MINUS_EXPR, arg_elem2, arg_elem1))
> >>|| !operand_equal_p (step1, step2, 0))
> >>
> >> OK with those changes, thanks.
> > Hi Richard,
> > Thanks for the suggestions, updated the attached patch accordingly.
> > Bootstrapped+tested with and without SVE on aarch64-linux-gnu and
> > x86_64-linux-gnu.
> > OK to commit ?
>
> Yes, thanks.
Thanks, committed to trunk in 3ec8ecb8e92faec889bc6f7aeac9ff59e82b4f7f.

Thanks,
Prathamesh
>
> Richard
>
> >
> > Thanks,
> > Prathamesh
> >>
> >> Richard
> >>
> >> > + {
> >> > +   if (reason)
> >> > + *reason = "not a natural stepped sequence";
> >> > +   return false;
> >> > + }
> >> > + }
> >> >  }
> >> >
> >> >return true;
> >> > @@ -17161,7 +17183,8 @@ namespace test_fold_vec_perm_cst {
> >> >  static tree
> >> >  build_vec_cst_rand (machine_mode vmode, unsigned npatterns,
> >> >   unsigned nelts_per_pattern,
> >> > - int step = 0, int threshold = 100)
> >> > + int step = 0, bool natural_stepped = false,
> >> > + int threshold = 100)
> >> >  {
> >> >tree inner_type = lang_hooks.types.type_for_mode (GET_MODE_INNER 
> >> > (vmode), 1);
> >> >tree vectype = build_vector_type_for_mode (inner_type, vmode);
> >> > @@ -17176,17 +17199,28 @@ build_vec_cst_rand (machine_mode vmode, 
> >> > unsigned npatterns,
> >> >
> >> >// Fill a1 for each pattern
> >> >for (unsigned i = 0; i < npatterns; i++)
> >> > -builder.quick_push (build_int_cst (inner_type, rand () % 
> >> > threshold));
> >> > -
> >> > +{
> >> > +  tree a1;
> >> > +  if (natural_stepped)
> >> > + {
> >> > +   tree a0 = builder[i];
> >> > +   wide_int a0_val = wi::to_wide (a0);
> >> > +   wide_int a1_val = a0_val + step;
> >> > +   a1 = wide_int_to_tree (inner_type, a1_val);
> >> > + }
> >> > +  else
> >> > + a1 = build_int_cst (inner_type, rand () % threshold);
> >> > +  builder.quick_push (a1);
> >> > +}
> >> >if (nelts_per_pattern == 2)
> >> >  return builder.build ();
> >> >
> >> >for (unsigned i = npatterns * 2; i < npatterns * nelts_per_pattern; 
> >> > i++)
> >> >  {
> >> >tree prev_elem = builder[i - npatterns];
> >> > -  int prev_elem_val = TREE_INT_CST_LOW (prev_elem);
> >> > -  int val = prev_elem_val + step;
> >> > -  builder.quick_push (build_int_cst (inner_type, val));
> >> > +  wide_int 

Re: [PATCH 10/11] aarch64: Generalise TFmode load/store pair patterns

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> This patch generalises the TFmode load/store pair patterns to TImode and
> TDmode.  This brings them in line with the DXmode patterns, and uses the
> same technique with separate mode iterators (TX and TX2) to allow for
> distinct modes in each arm of the load/store pair.
>
> For example, in combination with the post-RA load/store pair fusion pass
> in the following patch, this improves the codegen for the following
> varargs testcase involving TImode stores:
>
> void g(void *);
> int foo(int x, ...)
> {
> __builtin_va_list ap;
> __builtin_va_start (ap, x);
> g();
> __builtin_va_end (ap);
> }
>
> from:
>
> foo:
> .LFB0:
>   stp x29, x30, [sp, -240]!
> .LCFI0:
>   mov w9, -56
>   mov w8, -128
>   mov x29, sp
>   add x10, sp, 176
>   stp x1, x2, [sp, 184]
>   add x1, sp, 240
>   add x0, sp, 16
>   stp x1, x1, [sp, 16]
>   str x10, [sp, 32]
>   stp w9, w8, [sp, 40]
>   str q0, [sp, 48]
>   str q1, [sp, 64]
>   str q2, [sp, 80]
>   str q3, [sp, 96]
>   str q4, [sp, 112]
>   str q5, [sp, 128]
>   str q6, [sp, 144]
>   str q7, [sp, 160]
>   stp x3, x4, [sp, 200]
>   stp x5, x6, [sp, 216]
>   str x7, [sp, 232]
>   bl  g
>   ldp x29, x30, [sp], 240
> .LCFI1:
>   ret
>
> to:
>
> foo:
> .LFB0:
>   stp x29, x30, [sp, -240]!
> .LCFI0:
>   mov w9, -56
>   mov w8, -128
>   mov x29, sp
>   add x10, sp, 176
>   stp x1, x2, [sp, 184]
>   add x1, sp, 240
>   add x0, sp, 16
>   stp x1, x1, [sp, 16]
>   str x10, [sp, 32]
>   stp w9, w8, [sp, 40]
>   stp q0, q1, [sp, 48]
>   stp q2, q3, [sp, 80]
>   stp q4, q5, [sp, 112]
>   stp q6, q7, [sp, 144]
>   stp x3, x4, [sp, 200]
>   stp x5, x6, [sp, 216]
>   str x7, [sp, 232]
>   bl  g
>   ldp x29, x30, [sp], 240
> .LCFI1:
>   ret
>
> Note that this patch isn't needed if we only use the mode
> canonicalization approach in the new ldp fusion pass (since we
> canonicalize T{I,F,D}mode to V16QImode), but we seem to get slightly
> better performance with mode canonicalization disabled (see
> --param=aarch64-ldp-canonicalize-modes in the following patch).
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64.md (load_pair_dw_tftf): Rename to ...
>   (load_pair_dw_): ... this.
>   (store_pair_dw_tftf): Rename to ...
>   (store_pair_dw_): ... this.
>   * config/aarch64/iterators.md (TX2): New.

OK, thanks.  It would be nice to investigate & fix the reasons for
the regressions with canonicalised modes, but I agree that this patch
is a strict improvement, since it fixes a hole in the current scheme.

Richard

> ---
>  gcc/config/aarch64/aarch64.md   | 22 +++---
>  gcc/config/aarch64/iterators.md |  3 +++
>  2 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 32c7adc8928..e6af09c2e8b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1757,16 +1757,16 @@ (define_insn "load_pair_dw_"
>}
>  )
>  
> -(define_insn "load_pair_dw_tftf"
> -  [(set (match_operand:TF 0 "register_operand" "=w")
> - (match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
> -   (set (match_operand:TF 2 "register_operand" "=w")
> - (match_operand:TF 3 "memory_operand" "m"))]
> +(define_insn "load_pair_dw_"
> +  [(set (match_operand:TX 0 "register_operand" "=w")
> + (match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
> +   (set (match_operand:TX2 2 "register_operand" "=w")
> + (match_operand:TX2 3 "memory_operand" "m"))]
> "TARGET_SIMD
>  && rtx_equal_p (XEXP (operands[3], 0),
>   plus_constant (Pmode,
>  XEXP (operands[1], 0),
> -GET_MODE_SIZE (TFmode)))"
> +GET_MODE_SIZE (mode)))"
>"ldp\\t%q0, %q2, %z1"
>[(set_attr "type" "neon_ldp_q")
> (set_attr "fp" "yes")]
> @@ -1805,11 +1805,11 @@ (define_insn "store_pair_dw_"
>}
>  )
>  
> -(define_insn "store_pair_dw_tftf"
> -  [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
> - (match_operand:TF 1 "register_operand" "w"))
> -   (set (match_operand:TF 2 "memory_operand" "=m")
> - (match_operand:TF 3 "register_operand" "w"))]
> +(define_insn "store_pair_dw_"
> +  [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
> + (match_operand:TX 1 "register_operand" "w"))
> +   (set (match_operand:TX2 2 "memory_operand" "=m")
> + (match_operand:TX2 3 "register_operand" "w"))]
> "TARGET_SIMD &&
>  rtx_equal_p (XEXP (operands[2], 0),
>

Re: [PATCH 09/11] aarch64, testsuite: Fix up pr71727.c

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> The test is trying to check that we don't use q-register stores with
> -mstrict-align, so actually check specifically for that.
>
> This is a prerequisite to avoid regressing:
>
> scan-assembler-not "add\tx0, x0, :"
>
> with the upcoming ldp fusion pass, as we change where the ldps are
> formed such that a register is used rather than a symbolic (lo_sum)
> address for the first load.
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/pr71727.c: Adjust scan-assembler-not to
>   make sure we don't have q-register stores with -mstrict-align.

OK, thanks.

Richard

> ---
>  gcc/testsuite/gcc.target/aarch64/pr71727.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr71727.c 
> b/gcc/testsuite/gcc.target/aarch64/pr71727.c
> index 41fa72bc67e..226258a76fe 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr71727.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr71727.c
> @@ -30,4 +30,4 @@ _start (void)
>  }
>  
>  /* { dg-final { scan-assembler-times "mov\tx" 5 {target lp64} } } */
> -/* { dg-final { scan-assembler-not "add\tx0, x0, :" {target lp64} } } */
> +/* { dg-final { scan-assembler-not {st[rp]\tq[0-9]+} {target lp64} } } */


Re: [PATCH v3 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-10-18 Thread Jason Merrill

On 10/18/23 13:28, waffl3x wrote:

I will try to get something done today, but I was struggling with
writing some of the tests; there are also a lot more of them now. I also
wrote a bunch of musings in comments that I would like feedback on.

My most concrete question is: how exactly should I be testing a
pedwarn? I want to test that I get the correct warning and error with
the separate flags; do I have to create two separate tests for each one?



Yes. I tend to use letter suffixes for tests that vary only in flags
(and expected results), e.g. feature1a.C, feature1b.C.
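For a pedwarn that usually means a pair like this, where the option
spellings and diagnostic patterns below are only placeholders:

  // feature1a.C
  // { dg-additional-options "-Wc++23-extensions" }
  struct S { void f(this S self); };  // { dg-warning "" }

  // feature1b.C
  // { dg-additional-options "-pedantic-errors" }
  struct S { void f(this S self); };  // { dg-error "" }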


Will do.


Instead of OPT_Wpedantic, this should be controlled by
-Wc++23-extensions (OPT_Wc__23_extensions)


Yeah, I'll do this.


If you wanted, you could add a more specific warning option for this
(e.g. -Wc++23-explicit-this) which is also affected by
-Wc++23-extensions, but I would lean toward just using the existing
flag. Up to you.


I brought it up on IRC and there was some pushback to my point of view
on it, so I'll just stick with OPT_Wc__23_extensions for now. I do
think a more sophisticated interface would be beneficial, but I will
bring discussion around that up again in the future.

I've seen plenty of these G_ or _ macros on strings around, like in
grokfndecl for these errors.

G_("static member function %qD cannot have cv-qualifier")
G_("non-member function %qD cannot have cv-qualifier")

G_("static member function %qD cannot have ref-qualifier")
G_("non-member function %qD cannot have ref-qualifier")

I have been able to figure out that they relate to translation, but not
exactly what the protocol around them is.


The protocol is described in gcc/ABOUT-GCC-NLS.  In general, "strings" 
passed directly to a diagnostic function don't need any decoration, but 
if they're assigned to a variable first, they need G_() so they're 
recognized as diagnostic strings to be added to the translation table.


The _() macro is used for strings that are going to be passed to a %s, 
but it's better to avoid doing that for strings that need translation.  N_() 
is (rarely) used for strings that aren't diagnostic format strings, but 
get passed to another function that passes them to _().
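A self-contained illustration of the convention, with stubs standing in
for GCC's real macros and diagnostic machinery:

  #include <stdio.h>

  /* Stubs: the real G_()/_() come from intl.h and feed gettext.  */
  #define G_(msgid) msgid  /* mark for translation; translated later */
  #define _(msgid) msgid   /* translate at the point of use */

  static void
  error (const char *gmsgid, const char *arg)
  {
    /* The real diagnostic functions translate gmsgid internally.  */
    fprintf (stderr, gmsgid, arg);
  }

  static void
  report (int is_static, const char *decl)
  {
    /* Assigned to a variable first, so it needs G_() to be picked up
       for the translation table.  */
    const char *msg = is_static
      ? G_("static member function %s cannot have cv-qualifier\n")
      : G_("non-member function %s cannot have cv-qualifier\n");
    error (msg, decl);
  }

  int
  main (void)
  {
    report (1, "f");
    return 0;
  }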


Jason



Re: [PATCH 08/11] aarch64, testsuite: Tweak sve/pcs/args_9.c to allow stps

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> With the new ldp/stp pass enabled, there is a change in the codegen for
> this test as follows:
>
> add x8, sp, 16
> ptrue   p3.h, mul3
> str p3, [x8]
> -   str x8, [sp, 8]
> -   str x9, [sp]
> +   stp x9, x8, [sp]
> ptrue   p3.d, vl8
> ptrue   p2.s, vl7
> ptrue   p1.h, vl6
>
> i.e. we now form an stp that we were missing previously. This patch
> adjusts the scan-assembler such that it should pass whether or not
> we form the stp.
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/sve/pcs/args_9.c: Adjust scan-assemblers to
>   allow for stp.

OK, thanks.

Richard

> ---
>  gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c
> index ad9affadf02..942a44ab448 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c
> @@ -45,5 +45,5 @@ caller (int64_t *x0, int16_t *x1, svbool_t p0)
>return svcntp_b8 (res, res);
>  }
>  
> -/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.b, mul3\n\tstr\t\1, 
> \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp\]\n} } } */
> -/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.h, mul3\n\tstr\t\1, 
> \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp, 8\]\n} } } */
> +/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.b, mul3\n\tstr\t\1, 
> \[(x[0-9]+)\]\n.*\t(?:str\t\2, \[sp\]|stp\t\2, x[0-9]+, \[sp\])\n} } } */
> +/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.h, mul3\n\tstr\t\1, 
> \[(x[0-9]+)\]\n.*\t(?:str\t\2, \[sp, 8\]|stp\tx[0-9]+, \2, \[sp\])\n} } } */


Re: [PATCH 07/11] aarch64, testsuite: Prevent stp in lr_free_1.c

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> The test is looking for individual stores which are able to be merged
> into stp instructions.  The test currently passes -fno-schedule-fusion
> -fno-peephole2, presumably to prevent these stores from being turned
> into stps, but this is no longer sufficient with the new ldp/stp fusion
> pass.
>
> As such, we add --param=aarch64-stp-policy=never to prevent stps being
> formed.
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/lr_free_1.c: Add
>   --param=aarch64-stp-policy=never to dg-options.

OK.  Thanks to Manos for adding this --param.

Richard

> ---
>  gcc/testsuite/gcc.target/aarch64/lr_free_1.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/lr_free_1.c 
> b/gcc/testsuite/gcc.target/aarch64/lr_free_1.c
> index 50dcf04e697..9949061096e 100644
> --- a/gcc/testsuite/gcc.target/aarch64/lr_free_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/lr_free_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do run } */
> -/* { dg-options "-fno-inline -O2 -fomit-frame-pointer -ffixed-x2 -ffixed-x3 
> -ffixed-x4 -ffixed-x5 -ffixed-x6 -ffixed-x7 -ffixed-x8 -ffixed-x9 -ffixed-x10 
> -ffixed-x11 -ffixed-x12 -ffixed-x13 -ffixed-x14 -ffixed-x15 -ffixed-x16 
> -ffixed-x17 -ffixed-x18 -ffixed-x19 -ffixed-x20 -ffixed-x21 -ffixed-x22 
> -ffixed-x23 -ffixed-x24 -ffixed-x25 -ffixed-x26 -ffixed-x27 -ffixed-28 
> -ffixed-29 --save-temps -mgeneral-regs-only -fno-ipa-cp -fno-schedule-fusion 
> -fno-peephole2" } */
> +/* { dg-options "-fno-inline -O2 -fomit-frame-pointer -ffixed-x2 -ffixed-x3 
> -ffixed-x4 -ffixed-x5 -ffixed-x6 -ffixed-x7 -ffixed-x8 -ffixed-x9 -ffixed-x10 
> -ffixed-x11 -ffixed-x12 -ffixed-x13 -ffixed-x14 -ffixed-x15 -ffixed-x16 
> -ffixed-x17 -ffixed-x18 -ffixed-x19 -ffixed-x20 -ffixed-x21 -ffixed-x22 
> -ffixed-x23 -ffixed-x24 -ffixed-x25 -ffixed-x26 -ffixed-x27 -ffixed-28 
> -ffixed-29 --save-temps -mgeneral-regs-only -fno-ipa-cp -fno-schedule-fusion 
> -fno-peephole2 --param=aarch64-stp-policy=never" } */
>  
>  extern void abort ();
>  


Re: [PATCH 04/11] rtl-ssa: Support inferring uses of mem in change_insns

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> Currently, rtl_ssa::change_insns requires all new uses and defs to be
> specified explicitly.  This turns out to be rather inconvenient for
> forming load pairs in the new aarch64 load pair pass, as the pass has to
> determine which mem def the final load pair consumes, and then obtain or
> create a suitable use (i.e. significant bookkeeping, just to keep the
> RTL-SSA IR consistent).  It turns out to be much more convenient to
> allow change_insns to infer which def is consumed and create a suitable
> use of mem itself.  This patch does that.
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
>   * rtl-ssa/changes.cc (function_info::finalize_new_accesses): Add new
>   parameter to give final insn position, infer use of mem if it isn't
>   specified explicitly.
>   (function_info::change_insns): Pass down final insn position to
>   finalize_new_accesses.
>   * rtl-ssa/functions.h: Add parameter to finalize_new_accesses.

OK, thanks.

Richard

> ---
>  gcc/rtl-ssa/changes.cc  | 31 ---
>  gcc/rtl-ssa/functions.h |  2 +-
>  2 files changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/rtl-ssa/changes.cc b/gcc/rtl-ssa/changes.cc
> index c48ddd2463c..523ad60d7d8 100644
> --- a/gcc/rtl-ssa/changes.cc
> +++ b/gcc/rtl-ssa/changes.cc
> @@ -370,8 +370,11 @@ update_insn_in_place (insn_change )
>  // Finalize the new list of definitions and uses in CHANGE, removing
>  // any uses and definitions that are no longer needed, and converting
>  // pending clobbers into actual definitions.
> +//
> +// POS gives the final position of INSN, which hasn't yet been moved into
> +// place.
>  void
> -function_info::finalize_new_accesses (insn_change )
> +function_info::finalize_new_accesses (insn_change , insn_info *pos)
>  {
>insn_info *insn = change.insn ();
>  
> @@ -462,13 +465,34 @@ function_info::finalize_new_accesses (insn_change 
> )
>// Add (possibly temporary) uses to m_temp_uses for each resource.
>// If there are multiple references to the same resource, aggregate
>// information in the modes and flags.
> +  use_info *mem_use = nullptr;
>for (rtx_obj_reference ref : properties.refs ())
>  if (ref.is_read ())
>{
>   unsigned int regno = ref.regno;
>   machine_mode mode = ref.is_reg () ? ref.mode : BLKmode;
>   use_info *use = find_access (unshared_uses, ref.regno);
> - gcc_assert (use);
> + if (!use)
> +   {
> + // For now, we only support inferring uses of mem.
> + gcc_assert (regno == MEM_REGNO);
> +
> + if (mem_use)
> +   {
> + mem_use->record_reference (ref, false);
> + continue;
> +   }
> +
> + resource_info resource { mode, regno };
> + auto def = find_def (resource, pos).prev_def (pos);
> + auto set = safe_dyn_cast  (def);
> + gcc_assert (set);
> + mem_use = allocate (insn, resource, set);
> + mem_use->record_reference (ref, true);
> + m_temp_uses.safe_push (mem_use);
> + continue;
> +   }
> +
>   if (use->m_has_been_superceded)
> {
>   // This is the first reference to the resource.
> @@ -656,7 +680,8 @@ function_info::change_insns (array_slice 
> changes)
>  
> // Finalize the new list of accesses for the change.  Don't install
> // them yet, so that we still have access to the old lists below.
> -   finalize_new_accesses (change);
> +   finalize_new_accesses (change,
> +  placeholder ? placeholder : insn);
>   }
>placeholders[i] = placeholder;
>  }
> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
> index d7da9774213..73690a0e63b 100644
> --- a/gcc/rtl-ssa/functions.h
> +++ b/gcc/rtl-ssa/functions.h
> @@ -265,7 +265,7 @@ private:
>  
>insn_info *add_placeholder_after (insn_info *);
>void possibly_queue_changes (insn_change &);
> -  void finalize_new_accesses (insn_change &);
> +  void finalize_new_accesses (insn_change &, insn_info *);
>void apply_changes_to_insn (insn_change &);
>  
>void init_function_data ();


Re: [PATCH 03/11] rtl-ssa: Add entry point to allow re-parenting uses

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> This is needed by the upcoming aarch64 load pair pass, as it can
> re-order stores (when alias analysis determines this is safe) and thus
> change which mem def a given use consumes (in the RTL-SSA view, there is
> no alias disambiguation of memory).
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
>   * rtl-ssa/accesses.cc (function_info::reparent_use): New.
>   * rtl-ssa/functions.h (function_info): Declare new member
>   function reparent_use.

OK, thanks.

Richard

> ---
>  gcc/rtl-ssa/accesses.cc | 8 
>  gcc/rtl-ssa/functions.h | 3 +++
>  2 files changed, 11 insertions(+)
>
> diff --git a/gcc/rtl-ssa/accesses.cc b/gcc/rtl-ssa/accesses.cc
> index f12b5f4dd77..774ab9d99ee 100644
> --- a/gcc/rtl-ssa/accesses.cc
> +++ b/gcc/rtl-ssa/accesses.cc
> @@ -1239,6 +1239,14 @@ function_info::add_use (use_info *use)
>  insert_use_before (use, neighbor->value ());
>  }
>  
> +void
> +function_info::reparent_use (use_info *use, set_info *new_def)
> +{
> +  remove_use (use);
> +  use->set_def (new_def);
> +  add_use (use);
> +}
> +
>  // If USE has a known definition, remove USE from that definition's list
>  // of uses.  Also remove if it from the associated splay tree, if any.
>  void
> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
> index 8b53b264064..d7da9774213 100644
> --- a/gcc/rtl-ssa/functions.h
> +++ b/gcc/rtl-ssa/functions.h
> @@ -159,6 +159,9 @@ public:
>// Like change_insns, but for a single change CHANGE.
>void change_insn (insn_change );
>  
> +  // Given a use USE, re-parent it to get its def from NEW_DEF.
> +  void reparent_use (use_info *use, set_info *new_def);
> +
>// If the changes that have been made to instructions require updates
>// to the CFG, perform those updates now.  Return true if something 
> changed.
>// If it did:


Re: [PATCH 02/11] rtl-ssa: Add drop_memory_access helper

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> Add a helper routine to access-utils.h which removes the memory access
> from an access_array, if it has one.
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
>   * rtl-ssa/access-utils.h (drop_memory_access): New.
> ---
>  gcc/rtl-ssa/access-utils.h | 11 +++
>  1 file changed, 11 insertions(+)
>
> diff --git a/gcc/rtl-ssa/access-utils.h b/gcc/rtl-ssa/access-utils.h
> index fb2c2d3..0c108b18bb8 100644
> --- a/gcc/rtl-ssa/access-utils.h
> +++ b/gcc/rtl-ssa/access-utils.h
> @@ -51,6 +51,17 @@ memory_access (T accesses) -> decltype (accesses[0])
>return nullptr;
>  }
>  
> +template
> +inline T
> +drop_memory_access (T accesses)
> +{
> +  if (!memory_access (accesses))
> +return accesses;
> +
> +  access_array arr (accesses);
> +  return T (arr.begin (), accesses.size () - 1);
> +}

There ought to be a comment above the function.  OK with that change, thanks.

Richard

> +
>  // If sorted array ACCESSES includes a reference to REGNO, return the
>  // access, otherwise return null.
>  template


Re: [PATCH 01/11] rtl-ssa: Fix bug in function_info::add_insn_after

2023-10-18 Thread Richard Sandiford
Alex Coplan  writes:
> In the case that !insn->is_debug_insn () && next->is_debug_insn (), this
> function was missing an update of the prev pointer on the first nondebug
> insn following the sequence of debug insns starting at next.
>
> This can lead to corruption of the insn chain, in that we end up with:
>
>   insn->next_any_insn ()->prev_any_insn () != insn
>
> in this case.  This patch fixes that.
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
>   * rtl-ssa/insns.cc (function_info::add_insn_after): Ensure we
>   update the prev pointer on the following nondebug insn in the
>   case that !insn->is_debug_insn () && next->is_debug_insn ().

OK, thanks.

Richard

> ---
>  gcc/rtl-ssa/insns.cc | 14 +++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/rtl-ssa/insns.cc b/gcc/rtl-ssa/insns.cc
> index a0c2fec2b70..f970375d906 100644
> --- a/gcc/rtl-ssa/insns.cc
> +++ b/gcc/rtl-ssa/insns.cc
> @@ -291,9 +291,17 @@ function_info::add_insn_after (insn_info *insn, 
> insn_info *after)
> first->set_last_debug_insn (insn);
>   }
>else // !insn->is_debug_insn () && next->is_debug_insn ()
> - // At present we don't (need to) support inserting a nondebug
> - // instruction between two existing debug instructions.
> - gcc_assert (!after->is_debug_insn ());
> + {
> +   // At present we don't (need to) support inserting a nondebug
> +   // instruction between two existing debug instructions.
> +   gcc_assert (!after->is_debug_insn ());
> +
> +   // Find the next nondebug insn and update its previous pointer
> +   // to point to INSN.
> +   auto next_nondebug = next->last_debug_insn ()->next_any_insn ();
> +   gcc_checking_assert (!next_nondebug->is_debug_insn ());
> +   next_nondebug->set_prev_sametype_insn (insn);
> + }
>  
>// If AFTER and NEXT are separated by at least two points, we can
>// use a unique point number for INSN.  Otherwise INSN will have


Re: PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-18 Thread Richard Sandiford
Prathamesh Kulkarni  writes:
> On Tue, 17 Oct 2023 at 02:40, Richard Sandiford
>  wrote:
>> Prathamesh Kulkarni  writes:
>> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
>> > index 4f8561509ff..55a6a68c16c 100644
>> > --- a/gcc/fold-const.cc
>> > +++ b/gcc/fold-const.cc
>> > @@ -10684,9 +10684,8 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
>> > tree arg1,
>> >
>> >/* Ensure that the stepped sequence always selects from the same
>> >input pattern.  */
>> > -  unsigned arg_npatterns
>> > - = ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
>> > -   : VECTOR_CST_NPATTERNS (arg1);
>> > +  tree arg = ((q1 & 1) == 0) ? arg0 : arg1;
>> > +  unsigned arg_npatterns = VECTOR_CST_NPATTERNS (arg);
>> >
>> >if (!multiple_p (step, arg_npatterns))
>> >   {
>> > @@ -10694,6 +10693,29 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree arg1,
>> >   *reason = "step is not multiple of npatterns";
>> > return false;
>> >   }
>> > +
>> > +  /* If a1 chooses base element from arg, ensure that it's a natural
>> > +  stepped sequence, ie, (arg[2] - arg[1]) == (arg[1] - arg[0])
>> > +  to preserve arg's encoding.  */
>> > +
>> > +  unsigned HOST_WIDE_INT index;
>> > +  if (!r1.is_constant (&index))
>> > + return false;
>> > +  if (index < arg_npatterns)
>> > + {
>>
>> I don't know whether it matters in practice, but I think the two conditions
>> above are more natural as:
>>
>> if (maybe_lt (r1, arg_npatterns))
>>   {
>> unsigned HOST_WIDE_INT index;
>> if (!r1.is_constant (&index))
>>   return false;
>>
>> ...[code below]...
>>   }
>>
>> > +   tree arg_elem0 = vector_cst_elt (arg, index);
>> > +   tree arg_elem1 = vector_cst_elt (arg, index + arg_npatterns);
>> > +   tree arg_elem2 = vector_cst_elt (arg, index + arg_npatterns * 2);
>> > +
>> > +   if (!operand_equal_p (const_binop (MINUS_EXPR, arg_elem2, arg_elem1),
>> > +                         const_binop (MINUS_EXPR, arg_elem1, arg_elem0),
>> > +                         0))
>>
>> This needs to check whether const_binop returns null.  Maybe:
>>
>>tree step1, step2;
>>if (!(step1 = const_binop (MINUS_EXPR, arg_elem1, arg_elem0))
>>|| !(step2 = const_binop (MINUS_EXPR, arg_elem2, arg_elem1))
>>|| !operand_equal_p (step1, step2, 0))
>>
>> OK with those changes, thanks.
> Hi Richard,
> Thanks for the suggestions, updated the attached patch accordingly.
> Bootstrapped+tested with and without SVE on aarch64-linux-gnu and
> x86_64-linux-gnu.
> OK to commit ?

Yes, thanks.

Richard

>
> Thanks,
> Prathamesh
>>
>> Richard
>>
>> > + {
>> > +   if (reason)
>> > + *reason = "not a natural stepped sequence";
>> > +   return false;
>> > + }
>> > + }
>> >  }
>> >
>> >return true;
>> > @@ -17161,7 +17183,8 @@ namespace test_fold_vec_perm_cst {
>> >  static tree
>> >  build_vec_cst_rand (machine_mode vmode, unsigned npatterns,
>> >   unsigned nelts_per_pattern,
>> > - int step = 0, int threshold = 100)
>> > + int step = 0, bool natural_stepped = false,
>> > + int threshold = 100)
>> >  {
>> >tree inner_type = lang_hooks.types.type_for_mode (GET_MODE_INNER (vmode), 1);
>> >tree vectype = build_vector_type_for_mode (inner_type, vmode);
>> > @@ -17176,17 +17199,28 @@ build_vec_cst_rand (machine_mode vmode, unsigned npatterns,
>> >
>> >// Fill a1 for each pattern
>> >for (unsigned i = 0; i < npatterns; i++)
>> > -builder.quick_push (build_int_cst (inner_type, rand () % threshold));
>> > -
>> > +{
>> > +  tree a1;
>> > +  if (natural_stepped)
>> > + {
>> > +   tree a0 = builder[i];
>> > +   wide_int a0_val = wi::to_wide (a0);
>> > +   wide_int a1_val = a0_val + step;
>> > +   a1 = wide_int_to_tree (inner_type, a1_val);
>> > + }
>> > +  else
>> > + a1 = build_int_cst (inner_type, rand () % threshold);
>> > +  builder.quick_push (a1);
>> > +}
>> >if (nelts_per_pattern == 2)
>> >  return builder.build ();
>> >
>> >for (unsigned i = npatterns * 2; i < npatterns * nelts_per_pattern; i++)
>> >  {
>> >tree prev_elem = builder[i - npatterns];
>> > -  int prev_elem_val = TREE_INT_CST_LOW (prev_elem);
>> > -  int val = prev_elem_val + step;
>> > -  builder.quick_push (build_int_cst (inner_type, val));
>> > +  wide_int prev_elem_val = wi::to_wide (prev_elem);
>> > +  wide_int val = prev_elem_val + step;
>> > +  builder.quick_push (wide_int_to_tree (inner_type, val));
>> >  }
>> >
>> >return builder.build ();
>> > @@ -17432,7 +17466,7 @@ test_nunits_min_2 (machine_mode vmode)
>> >and step (a2 - a1) = 1, step is not a multiple of npatterns
>> >in input vector. So return NULL_TREE.  */
>> >{
>> > - tree 

Re: [PATCH v3 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-10-18 Thread Jakub Jelinek
On Wed, Oct 18, 2023 at 05:28:10PM +, waffl3x wrote:
> I've seen plenty of these G_ or _ macros on strings around like in
> grokfndecl for these errors.
> 
> G_("static member function %qD cannot have cv-qualifier")
> G_("non-member function %qD cannot have cv-qualifier")
> 
> G_("static member function %qD cannot have ref-qualifier")
> G_("non-member function %qD cannot have ref-qualifier")
> 
> I have been able to figure out it relates to translation, but not
> exactly what the protocol around them is. I think in my original patch
> I had refactored this code a bunch, I figured adding a 3rd case to it
> justifies a refactor. I think I forgot to add those changes to the
> original patch, either that or I undid it or moved it somewhere else.
> Anyway, the point is, coming back to it now to re-add those diagnostics
> I realized I probably shouldn't have changed those strings.
> 
> I also have been wondering whether I should be putting macros on any
> strings I add, it seemed like there might have been a macro for text
> that needs translation. Is this something I should be doing?

There are different kinds of format strings in GCC, the most common
are the gcc-internal-format strings.  If you call a function which
is expected to take such translatable format string (in particular
a function which takes a gmsgid named argument like error, error_at,
pedwarn, warning_at, ...) and pass a string literal to that function,
nothing needs to be marked in a special way, both gcc/po/exgettext
is able to collect such literals into gcc/po/gcc.pot for translations
and the function is supposed to use gettext etc. to translate it
- e.g. see diagnostic_set_info using _(gmsgid) for that.
But, if there is e.g. a temporary pointer var which points to format
strings and only that is eventually passed to the diagnostic functions,
gcc/po/exgettext won't be able to collect such literals, which is where
the G_() macro comes into play and one marks the string as
gcc-internal-format with it; the translation is still handled by the
diagnostic function at runtime.  The N_() macro is similar but for c-format
strings instead.  The _() macro both collects for translations if it is
used with string literal, and expands to gettext call to translate it at
runtime, which is something that should be avoided if something translates
it again.

And another i18n rule is that one shouldn't try to construct diagnostic
messages from parts of english sentences, it is fine to fill in with %s/%qs
etc. language keywords etc. but otherwise the format string should contain
the whole diagnostic line, so that translators can reorder the words etc.
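
As a minimal sketch of the G_() pattern described above (the names
is_static, loc and fndecl here are placeholders for illustration, not
taken from any actual GCC source):

  /* The literal only reaches the diagnostic function through a pointer,
     so mark it with G_() so that gcc/po/exgettext still collects it;
     error_at translates the gcc-internal-format string at runtime.  */
  const char *msg = is_static
                    ? G_("static member function %qD cannot have cv-qualifier")
                    : G_("non-member function %qD cannot have cv-qualifier");
  error_at (loc, msg, fndecl);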

Jakub



Re: [x86 PATCH] PR 106245: Split (x<<31)>>31 as -(x&1) in i386.md

2023-10-18 Thread Uros Bizjak
On Tue, Oct 17, 2023 at 7:54 PM Roger Sayle  wrote:
>
>
> Hi Uros,
> Thanks for the speedy review.
>
> > From: Uros Bizjak 
> > Sent: 17 October 2023 17:38
> >
> > On Tue, Oct 17, 2023 at 3:08 PM Roger Sayle 
> > wrote:
> > >
> > >
> > > This patch is the backend piece of a solution to PRs 101955 and
> > > 106245, that adds a define_insn_and_split to the i386 backend, to
> > > perform sign extension of a single (least significant) bit using AND $1 
> > > then NEG.
> > >
> > > Previously, (x<<31)>>31 would be generated as
> > >
> > > sall$31, %eax   // 3 bytes
> > > sarl$31, %eax   // 3 bytes
> > >
> > > with this patch the backend now generates:
> > >
> > > andl$1, %eax// 3 bytes
> > > negl%eax// 2 bytes
> > >
> > > Not only is this smaller in size, but microbenchmarking confirms that
> > > it's a performance win on both Intel and AMD; Intel sees only a 2%
> > > improvement (perhaps just a size effect), but AMD sees a 7% win.
> > >
> > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > > and make -k check, both with and without --target_board=unix{-m32}
> > > with no new failures.  Ok for mainline?
> > >
> > >
> > > 2023-10-17  Roger Sayle  
> > >
> > > gcc/ChangeLog
> > > PR middle-end/101955
> > > PR tree-optimization/106245
> > > * config/i386/i386.md (*extv_1_0): New 
> > > define_insn_and_split.
> > >
> > > gcc/testsuite/ChangeLog
> > > PR middle-end/101955
> > > PR tree-optimization/106245
> > > * gcc.target/i386/pr106245-2.c: New test case.
> > > * gcc.target/i386/pr106245-3.c: New 32-bit test case.
> > > * gcc.target/i386/pr106245-4.c: New 64-bit test case.
> > > * gcc.target/i386/pr106245-5.c: Likewise.
> >
> > +;; Split sign-extension of single least significant bit as and x,$1;neg x
> > +(define_insn_and_split "*extv_1_0"
> > +  [(set (match_operand:SWI48 0 "register_operand" "=r")
> > + (sign_extract:SWI48 (match_operand:SWI48 1 "register_operand" "0")
> > +(const_int 1)
> > +(const_int 0)))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  ""
> > +  "#"
> > +  "&& 1"
> >
> > No need to use "&&" for an empty insn constraint. Just use 
> > "reload_completed" in
> > this case.
> >
> > +  [(parallel [(set (match_dup 0) (and:SWI48 (match_dup 1) (const_int 1)))
> > +  (clobber (reg:CC FLAGS_REG))])
> > +   (parallel [(set (match_dup 0) (neg:SWI48 (match_dup 0)))
> > +  (clobber (reg:CC FLAGS_REG))])])
> >
> > Did you intend to split this after reload? If this is the case, then 
> > reload_completed
> > is missing.
>
> Because this splitter neither required the allocation of a new pseudo, nor a
> hard register assignment, i.e. it's a splitter that can be run before or after
> reload, it's written to split "whenever".  If you'd prefer it to only split 
> after
> reload, I agree a "reload_completed" can be added (alternatively, adding
> "ix86_pre_reload_split ()" would also work).

No, this part is OK. I just forgot that we have universal splitters ;)

> I now see from "*load_tp_" that "" is perhaps preferred over "&& 1"
> In these cases.  Please let me know which you prefer.

"" please for an empty insn constraint.

OK otherwise.

Thanks,
Uros.
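
For reference, the kind of source this split targets looks like the
following (a hypothetical example, not taken from the new testcases;
assumes 32-bit int):

  int sign_extend_lsb (int x)
  {
    /* -1 if bit 0 of x is set, 0 otherwise; previously compiled to
       sall $31 / sarl $31, now to andl $1 / negl.  */
    return (x << 31) >> 31;
  }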


Re: [PATCH v3 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-10-18 Thread waffl3x
> > I will try to get something done today, but I was struggling with
> > writing some of the tests, there's also a lot more of them now. I also
> > wrote a bunch of musings in comments that I would like feedback on.
> > 
> > My most concrete question is, how exactly should I be testing a
> > pedwarn, I want to test that I get the correct warning and error with
> > the separate flags, do I have to create two separate tests for each one?
> 
> 
> Yes. I tend to use letter suffixes for tests that vary only in flags
> (and expected results), e.g. feature1a.C, feature1b.C.

Will do.

> Instead of OPT_Wpedantic, this should be controlled by
> -Wc++23-extensions (OPT_Wc__23_extensions)

Yeah, I'll do this.

> If you wanted, you could add a more specific warning option for this
> (e.g. -Wc++23-explicit-this) which is also affected by
> -Wc++23-extensions, but I would lean toward just using the existing
> flag. Up to you.

I brought it up in irc and there was some pushback to my point of view
on it, so I'll just stick with OPT_Wc__23_extensions for now. I do
think a more sophisticated interface would be beneficial but I will
bring discussion around that up again in the future.

I've seen plenty of these G_ or _ macros on strings around like in
grokfndecl for these errors.

G_("static member function %qD cannot have cv-qualifier")
G_("non-member function %qD cannot have cv-qualifier")

G_("static member function %qD cannot have ref-qualifier")
G_("non-member function %qD cannot have ref-qualifier")

I have been able to figure out it relates to translation, but not
exactly what the protocol around them is. I think in my original patch
I had refactored this code a bunch, I figured adding a 3rd case to it
justifies a refactor. I think I forgot to add those changes to the
original patch, either that or I undid it or moved it somewhere else.
Anyway, the point is, coming back to it now to re-add those diagnostics
I realized I probably shouldn't have changed those strings.

I also have been wondering whether I should be putting macros on any
strings I add, it seemed like there might have been a macro for text
that needs translation. Is this something I should be doing?

Alex



[committed] pru: Implement TARGET_INSN_COST

2023-10-18 Thread Dimitar Dimitrov
This patch slightly improves the embench-iot benchmark score for
PRU code size.  There is also small improvement in a few real-world
firmware programs.

  Embench-iot size
  ----------------
  Benchmark        before   after   delta
  ---------        ------   -----   -----
  aha-mont64        4.15     4.15    0
  crc32             6.04     6.04    0
  cubic            21.64    21.62   -0.02
  edn               6.37     6.37    0
  huffbench        18.63    18.55   -0.08
  matmult-int       5.44     5.44    0
  md5sum           25.56    25.43   -0.13
  minver           12.82    12.76   -0.06
  nbody            15.09    14.97   -0.12
  nettle-aes        4.75     4.75    0
  nettle-sha256     4.67     4.67    0
  nsichneu          3.77     3.77    0
  picojpeg          4.11     4.11    0
  primecount        7.90     7.90    0
  qrduino           7.18     7.16   -0.02
  sglib-combined   13.63    13.59   -0.04
  slre              5.19     5.19    0
  st               14.23    14.12   -0.11
  statemate         2.34     2.34    0
  tarfind          36.85    36.64   -0.21
  ud               10.51    10.46   -0.05
  wikisort          7.44     7.41   -0.03
  ---------        ------   -----   -----
  Geometric mean    8.42     8.40   -0.02
  Geometric SD      2.00     2.00    0
  Geometric range  12.68    12.62   -0.06

Reg-tested pru-unknown-elf, and committed to trunk.

gcc/ChangeLog:

* config/pru/pru.cc (pru_insn_cost): New function.
(TARGET_INSN_COST): Define for PRU.

Signed-off-by: Dimitar Dimitrov 
---
 gcc/config/pru/pru.cc | 36 
 1 file changed, 36 insertions(+)

diff --git a/gcc/config/pru/pru.cc b/gcc/config/pru/pru.cc
index 6e8112be64a..fd1924e38dc 100644
--- a/gcc/config/pru/pru.cc
+++ b/gcc/config/pru/pru.cc
@@ -783,6 +783,39 @@ pru_rtx_costs (rtx x, machine_mode mode,
   }
 }
 }
+
+/* Insn costs on PRU are straightforward because:
+     - Insns emit 0, 1 or more instructions.
+     - All instructions are 32-bit length.
+     - All instructions execute in 1 cycle (sans memory access delays).
+   The "length" attribute maps nicely to the insn cost.  */
+
+static int
+pru_insn_cost (rtx_insn *insn, bool speed)
+{
+  /* Use generic cost calculation for unrecognized insns.  */
+  if (recog_memoized (insn) < 0)
+    return pattern_cost (insn, speed);
+
+  unsigned int len = get_attr_length (insn);
+
+  gcc_assert ((len % 4) == 0);
+
+  int cost = COSTS_N_INSNS (len / 4);
+  /* Some insns have zero length (e.g. blockage, pruloop_end).
+     In such cases give the minimum cost, because a return of
+     0 would incorrectly indicate that the insn cost is unknown.  */
+  if (cost == 0)
+    cost = 1;
+
+  /* Writes are usually posted, so they take 1 cycle.  Reads
+     from DMEM usually take 3 cycles.
+     See TI document SPRACE8A, Device-Specific PRU Read Latency Values.  */
+  if (speed && get_attr_type (insn) == TYPE_LD)
+    cost += COSTS_N_INSNS (2);
+
+  return cost;
+}
 
 static GTY(()) rtx eqdf_libfunc;
 static GTY(()) rtx nedf_libfunc;
@@ -3175,6 +3208,9 @@ pru_unwind_word_mode (void)
 #undef TARGET_RTX_COSTS
 #define TARGET_RTX_COSTS pru_rtx_costs
 
+#undef TARGET_INSN_COST
+#define TARGET_INSN_COST pru_insn_cost
+
 #undef TARGET_PRINT_OPERAND
 #define TARGET_PRINT_OPERAND pru_print_operand
 
-- 
2.41.0



Re: [PATCH V2 00/14] Refactor and cleanup vsetvl pass

2023-10-18 Thread Patrick O'Neill

Hi Luhua,

Here's the excerpts from the debug log. I think the full log files are 
too large to send over email.


rv32_gcv avl_single-32.c:

Executing on host: 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 
-B/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/
  
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-34.c
  -march=rv32gcv -mabi=ilp32d -mcmodel=medlow   -fdiagnostics-plain-output  -O0 
-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2 
-fno-tree-vectorize -ffat-lto-objects -fno-ident -S   -o avl_single-34.s
(timeout = 600)
spawn -ignore SIGHUP 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 
-B/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/
 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-34.c
 -march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output -O0 
-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2 
-fno-tree-vectorize -ffat-lto-objects -fno-ident -S -o avl_single-34.s
PASS: gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O0  (test for excess 
errors)
Executing on host: 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 
-B/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/
  
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-34.c
  -march=rv32gcv -mabi=ilp32d -mcmodel=medlow   -fdiagnostics-plain-output  -O1 
-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2 
-fno-tree-vectorize -ffat-lto-objects -fno-ident -S   -o avl_single-34.s
(timeout = 600)
spawn -ignore SIGHUP 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 
-B/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/
 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-34.c
 -march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output -O1 
-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2 
-fno-tree-vectorize -ffat-lto-objects -fno-ident -S -o avl_single-34.s
PASS: gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O1  (test for excess 
errors)
PASS: gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O1   scan-assembler-times 
vsetvli\\s+zero,\\s*[a-x0-9]+,\\s*e8,\\s*mf8,\\s*tu,\\s*m[au] 1
gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O1  : vsetvli found 2 times
FAIL: gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O1   scan-assembler-times 
vsetvli 1
Executing on host: 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 
-B/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/
  
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-34.c
  -march=rv32gcv -mabi=ilp32d -mcmodel=medlow   -fdiagnostics-plain-output  -O2 
-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2 
-fno-tree-vectorize -ffat-lto-objects -fno-ident -S   -o avl_single-34.s
(timeout = 600)
spawn -ignore SIGHUP 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 
-B/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/
 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-34.c
 -march=rv32gcv -mabi=ilp32d -mcmodel=medlow -fdiagnostics-plain-output -O2 
-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2 
-fno-tree-vectorize -ffat-lto-objects -fno-ident -S -o avl_single-34.s
PASS: gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O2  (test for excess 
errors)
PASS: gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O2   scan-assembler-times 
vsetvli\\s+zero,\\s*[a-x0-9]+,\\s*e8,\\s*mf8,\\s*tu,\\s*m[au] 1
gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O2  : vsetvli found 2 times
FAIL: gcc.target/riscv/rvv/vsetvl/avl_single-34.c   -O2   scan-assembler-times 
vsetvli 1
Executing on host: 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 
-B/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/
  
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/gcc/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-34.c
  -march=rv32gcv -mabi=ilp32d -mcmodel=medlow   -fdiagnostics-plain-output  -O3 
-fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions 
-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2 
-fno-tree-vectorize -ffat-lto-objects -fno-ident -S   -o avl_single-34.s
(timeout = 600)
spawn -ignore SIGHUP 
/scratch/tc-testing/tc-oct-17-vsetvli-refactor/build/build-gcc-linux-stage2/gcc/xgcc
 

[avr,committed] LibF7: Implement a function that was missing for devices without MUL.

2023-10-18 Thread Georg-Johann Lay

This implements the worker function for double multiplication
for devices without MUL instruction.

Johann

--

LibF7: Implement mul_mant for devices without MUL instruction.

libgcc/config/avr/libf7/
* libf7-asm.sx (mul_mant): Implement for devices without MUL.
* asm-defs.h (wmov) [!HAVE_MUL]: Fix regno computation.
* t-libf7 (F7_ASM_FLAGS): Add -g0.

diff --git a/libgcc/config/avr/libf7/asm-defs.h b/libgcc/config/avr/libf7/asm-defs.h
index 4cfd3e61cbb..a50260a162f 100644
--- a/libgcc/config/avr/libf7/asm-defs.h
+++ b/libgcc/config/avr/libf7/asm-defs.h
@@ -134,14 +134,14 @@
 ..regno = 0

.irp    reg,\
-X, x, XL, xl, Xl, xL, x, x  \
+X, x, XL, xl, Xl, xL, x, x, \
 Y, y, YL, yl, Yl, yL, y, y, \
 Z, z, ZL, zl, Zl, zL, z, z
 .ifc  \reg,\dst
-..dst = (..regno / 8) + 26
+..dst = 2 * (..regno / 8) + 26
 .endif
 .ifc  \reg,\src
-..src = (..regno / 8) + 26
+..src = 2 * (..regno / 8) + 26
 .endif
 ..regno = ..regno + 1
 .endr
diff --git a/libgcc/config/avr/libf7/libf7-asm.sx b/libgcc/config/avr/libf7/libf7-asm.sx
index 5df167fe73c..4505764c126 100644
--- a/libgcc/config/avr/libf7/libf7-asm.sx
+++ b/libgcc/config/avr/libf7/libf7-asm.sx
@@ -1067,6 +1067,100 @@ DEFUN mul_mant
 ENDF mul_mant
 #endif /* F7MOD_mul_mant_ && MUL */

+#if defined F7MOD_mul_mant_ && ! defined (__AVR_HAVE_MUL__)
+#define AA  TMP
+#define A0  13
+#define A1  A0+1
+#define A2  A0+2
+#define A3  A0+3
+#define A4  A0+4
+#define A5  r26
+#define A6  r27
+#define BB  ZERO
+#define Bits    r29
+#define Bytes   r28
+
+DEFUN mul_mant
+do_prologue_saves 7
+bst r18,0   ; T = 1: Don't round.
+;; Save result address for later.
+push r25
+push r24
+;; Load 1st operand mantissa.
+wmov r30,r22
+clr AA
+LDD A0, Z+0+Off
+LDD A1, Z+1+Off
+LDD A2, Z+2+Off
+LDD A3, Z+3+Off
+LDD A4, Z+4+Off
+LDD A5, Z+5+Off
+LDD A6, Z+6+Off
+;; Let Z point one past .mant of the 2nd input operand.
+wmov r30,r20
+adiw r30,Expo
+
+;; Clear the result mantissa.
+.global __clr_8
+XCALL   __clr_8
+
+;; Loop over the bytes of B's mantissa from highest to lowest.
+;; "+1" because we jump into the loop.
+ldi Bytes,  1 + F7_MANT_BYTES
+
+;; Divide one operand by 2 so that the result mantissa won't overflow.
+;; This is accounted for by "Carry = 1" below.
+ldi Bits,   1
+rjmp .Loop_entry
+
+.Loop_bytes:
+ld  BB, -Z
+;;  Loop over the bits of B's mantissa from highest to lowest.
+ldi Bits,   8
+.Loop_bits:
+lsl BB
+brcc .Lnext_bit
+
+ADD CA, AA
+adc C0, A0
+adc C1, A1
+adc C2, A2
+adc C3, A3
+adc C4, A4
+adc C5, A5
+adc C6, A6
+
+.Lnext_bit:
+.Loop_entry:
+LSR A6
+ror A5
+ror A4
+ror A3
+ror A2
+ror A1
+ror A0
+ror AA
+
+dec Bits
+brne .Loop_bits
+
+dec Bytes
+brne .Loop_bytes
+
+;; Finally...
+
+pop ZL
+pop ZH
+
+;; The result has to be left-shifted by one (multiplied by 2) in order
+;; to undo the division by 2 of the 1st operand.
+ldi Carry,  1
+F7call  normalize.maybe_round.store_with_flags
+
+do_epilogue_restores 7
+ENDF mul_mant
+#endif /* F7MOD_mul_mant_ && ! MUL */
+

 #if defined (F7MOD_div_)

diff --git a/libgcc/config/avr/libf7/t-libf7 b/libgcc/config/avr/libf7/t-libf7
index 30aa280d11e..f17e67e8523 100644
--- a/libgcc/config/avr/libf7/t-libf7
+++ b/libgcc/config/avr/libf7/t-libf7
@@ -86,7 +86,7 @@ F7_C_FLAGS +=   $(F7_FLAGS) \
-fno-tree-loop-optimize \
-fno-tree-loop-im -fno-move-loop-invariants

-F7_ASM_FLAGS += $(F7_FLAGS)
+F7_ASM_FLAGS += $(F7_FLAGS) -g0

 $(patsubst %, f7_c_%.o, $(CALL_PROLOGUES)) \
: F7_C_FLAGS += -mcall-prologues
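
The mul_mant loop above is a classical shift-and-add multiply that keeps
the high part of the product.  A C model of the idea (illustrative only;
the real code works on 7-byte mantissas, keeps the shifted-out bits in
the guard byte AA for rounding, and defers the final left shift to the
normalize/round step):

  #include <stdint.h>

  /* Scan B's bits from most to least significant; when a bit is set,
     add A to the accumulator, then halve A.  A is pre-halved once so
     the accumulator cannot overflow; the caller undoes that with one
     final left shift (the "Carry = 1" in the asm above).  */
  static uint32_t
  mant_mul_high (uint32_t a, uint32_t b)
  {
    uint32_t acc = 0;
    a >>= 1;                    /* up-front division by 2 */
    for (int bit = 31; bit >= 0; bit--)
      {
        if (b & (1u << bit))
          acc += a;
        a >>= 1;                /* bits shifted out are truncated here */
      }
    return acc;                 /* roughly the high half of a*b, halved */
  }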


Re: [PATCH] c++/modules: ICE with lambda initializing local var [PR105322]

2023-10-18 Thread Patrick Palka
On Wed, 18 Oct 2023, Patrick Palka wrote:

> Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> trunk?

Note that this doesn't fix the other testcase in the PR, which doesn't use any
lambdas and which ICEs in the same way:

export module pr105322;

auto f() {
  struct A { int m; };
  return A{};
}

export
inline void g() {
  auto r = decltype(f()){0};
}

Here when streaming the CONSTRUCTOR initializer of r, we end up streaming
components of f()::A before ever streaming the declaration/definition of
f()::A.  I suspect a separate fix might be needed for this testcase?
The narrow fix for the lambda testcase still seems useful nonetheless.

> 
> -- >8 --
> 
> For a local variable initialized by a lambda:
> 
>   auto f = []{};
> 
> The corresponding BLOCK_VARS contains the variable declaration first,
> followed by the closure type declaration, consistent with the
> syntactical order.  This however means that a use of the closure type
> appears (in the variable type/initializer) before the declaration of the
> type.  This ends up causing an ICE when streaming the BLOCK_VARS of f1
> below because we stream (by value) the CONSTRUCTOR initializer of g1 --
> which contains components of the closure type -- before we've streamed
> the declaration defining the closure type.  The following comment in
> module.cc seems relevant:
> 
>   /* We want to stream the type of a expression-like nodes /after/
>  we've streamed the operands.  The type often contains (bits
>  of the) types of the operands, and with things like decltype
>  and noexcept in play, we really want to stream the decls
>  defining the type before we try and stream the type on its
>  own.  Otherwise we can find ourselves trying to read in a
>  decl, when we're already partially reading in a component of
>  its type.  And that's bad.  */
> 
> This patch narrowly fixes this issue by special casing closure type
> declarations in add_decl_to_level.  (A loop is needed since there could
> be multiple variable declarations with an unprocessed initializer in
> light of structured bindings.)
> 
>   PR c++/105322
> 
> gcc/cp/ChangeLog:
> 
>   * name-lookup.cc (add_decl_to_level): When adding a closure
>   type declaration to a block scope, add it before rather than
>   after any variable declarations whose initializer we're still
>   processing.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.dg/modules/lambda-5_a.C: New test.
>   * g++.dg/modules/lambda-5_b.C: New test.
> ---
>  gcc/cp/name-lookup.cc | 19 ---
>  gcc/testsuite/g++.dg/modules/lambda-5_a.C | 23 +++
>  gcc/testsuite/g++.dg/modules/lambda-5_b.C | 10 ++
>  3 files changed, 49 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/modules/lambda-5_a.C
>  create mode 100644 gcc/testsuite/g++.dg/modules/lambda-5_b.C
> 
> diff --git a/gcc/cp/name-lookup.cc b/gcc/cp/name-lookup.cc
> index a8b9229b29e..bb00baaf9f4 100644
> --- a/gcc/cp/name-lookup.cc
> +++ b/gcc/cp/name-lookup.cc
> @@ -391,9 +391,22 @@ add_decl_to_level (cp_binding_level *b, tree decl)
>gcc_assert (b->names != decl);
>  
>/* We build up the list in reverse order, and reverse it later if
> - necessary.  */
> -  TREE_CHAIN (decl) = b->names;
> -  b->names = decl;
> + necessary.  If we're adding a lambda closure type to a block
> + scope as part of a local variable initializer, then make sure
> + we declare the type before the variable; modules expects that
> + we see a type declaration before a use of the type.  */
> +  tree *prev = &b->names;
> +  if (b->kind == sk_block
> +      && !processing_template_decl
> +      && TREE_CODE (decl) == TYPE_DECL
> +      && LAMBDA_TYPE_P (TREE_TYPE (decl)))
> +    while (*prev && VAR_P (*prev)
> +           && !DECL_EXTERNAL (*prev)
> +           && !DECL_INITIALIZED_P (*prev))
> +      prev = &TREE_CHAIN (*prev);
> +
> +  TREE_CHAIN (decl) = *prev;
> +  *prev = decl;
>  
>/* If appropriate, add decl to separate list of statics.  We include
>   extern variables because they might turn out to be static later.
> diff --git a/gcc/testsuite/g++.dg/modules/lambda-5_a.C b/gcc/testsuite/g++.dg/modules/lambda-5_a.C
> new file mode 100644
> index 000..6b54c8e3173
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/modules/lambda-5_a.C
> @@ -0,0 +1,23 @@
> +// PR c++/105322
> +// { dg-additional-options -fmodules-ts }
> +// { dg-module-cmi pr105322 }
> +
> +export module pr105322;
> +
> +struct A { };
> +
> +export
> +inline void f1() {
> +  A a;
> +  auto g1 = [a] { }; // used to ICE here during stream out
> +}
> +
> +export
> +template<class T>
> +void f2() {
> +  A a;
> +  auto g2 = [a] { };
> +}
> +
> +export
> +inline auto g3 = [a=A{}] { };
> diff --git a/gcc/testsuite/g++.dg/modules/lambda-5_b.C b/gcc/testsuite/g++.dg/modules/lambda-5_b.C
> new file mode 100644
> index 000..e25a913b726
> --- 

[PATCH] c++/modules: ICE with lambda initializing local var [PR105322]

2023-10-18 Thread Patrick Palka
Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk?

-- >8 --

For a local variable initialized by a lambda:

  auto f = []{};

The corresponding BLOCK_VARS contains the variable declaration first,
followed by the closure type declaration, consistent with the
syntactical order.  This however means that a use of the closure type
appears (in the variable type/initializer) before the declaration of the
type.  This ends up causing an ICE when streaming the BLOCK_VARS of f1
below because we stream (by value) the CONSTRUCTOR initializer of g1 --
which contains components of the closure type -- before we've streamed
the declaration defining the closure type.  The following comment in
module.cc seems relevant:

  /* We want to stream the type of a expression-like nodes /after/
 we've streamed the operands.  The type often contains (bits
 of the) types of the operands, and with things like decltype
 and noexcept in play, we really want to stream the decls
 defining the type before we try and stream the type on its
 own.  Otherwise we can find ourselves trying to read in a
 decl, when we're already partially reading in a component of
 its type.  And that's bad.  */

This patch narrowly fixes this issue by special casing closure type
declarations in add_decl_to_level.  (A loop is needed since there could
be multiple variable declarations with an unprocessed initializer in
light of structured bindings.)
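
For instance (a hypothetical case, not among the added tests), a
structured binding declares several variables before its initializer,
and hence any closure type within it, is processed:

  #include <utility>
  void h () {
    auto [x, y] = std::pair{ []{ return 0; }, 1 };  // x and y both precede
                                                    // the closure's TYPE_DECL
  }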

PR c++/105322

gcc/cp/ChangeLog:

* name-lookup.cc (add_decl_to_level): When adding a closure
type declaration to a block scope, add it before rather than
after any variable declarations whose initializer we're still
processing.

gcc/testsuite/ChangeLog:

* g++.dg/modules/lambda-5_a.C: New test.
* g++.dg/modules/lambda-5_b.C: New test.
---
 gcc/cp/name-lookup.cc | 19 ---
 gcc/testsuite/g++.dg/modules/lambda-5_a.C | 23 +++
 gcc/testsuite/g++.dg/modules/lambda-5_b.C | 10 ++
 3 files changed, 49 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/modules/lambda-5_a.C
 create mode 100644 gcc/testsuite/g++.dg/modules/lambda-5_b.C

diff --git a/gcc/cp/name-lookup.cc b/gcc/cp/name-lookup.cc
index a8b9229b29e..bb00baaf9f4 100644
--- a/gcc/cp/name-lookup.cc
+++ b/gcc/cp/name-lookup.cc
@@ -391,9 +391,22 @@ add_decl_to_level (cp_binding_level *b, tree decl)
   gcc_assert (b->names != decl);
 
   /* We build up the list in reverse order, and reverse it later if
- necessary.  */
-  TREE_CHAIN (decl) = b->names;
-  b->names = decl;
+ necessary.  If we're adding a lambda closure type to a block
+ scope as part of a local variable initializer, then make sure
+ we declare the type before the variable; modules expects that
+ we see a type declaration before a use of the type.  */
+  tree *prev = &b->names;
+  if (b->kind == sk_block
+      && !processing_template_decl
+      && TREE_CODE (decl) == TYPE_DECL
+      && LAMBDA_TYPE_P (TREE_TYPE (decl)))
+    while (*prev && VAR_P (*prev)
+           && !DECL_EXTERNAL (*prev)
+           && !DECL_INITIALIZED_P (*prev))
+      prev = &TREE_CHAIN (*prev);
+
+  TREE_CHAIN (decl) = *prev;
+  *prev = decl;
 
   /* If appropriate, add decl to separate list of statics.  We include
  extern variables because they might turn out to be static later.
diff --git a/gcc/testsuite/g++.dg/modules/lambda-5_a.C b/gcc/testsuite/g++.dg/modules/lambda-5_a.C
new file mode 100644
index 000..6b54c8e3173
--- /dev/null
+++ b/gcc/testsuite/g++.dg/modules/lambda-5_a.C
@@ -0,0 +1,23 @@
+// PR c++/105322
+// { dg-additional-options -fmodules-ts }
+// { dg-module-cmi pr105322 }
+
+export module pr105322;
+
+struct A { };
+
+export
+inline void f1() {
+  A a;
+  auto g1 = [a] { }; // used to ICE here during stream out
+}
+
+export
+template<class T>
+void f2() {
+  A a;
+  auto g2 = [a] { };
+}
+
+export
+inline auto g3 = [a=A{}] { };
diff --git a/gcc/testsuite/g++.dg/modules/lambda-5_b.C b/gcc/testsuite/g++.dg/modules/lambda-5_b.C
new file mode 100644
index 000..e25a913b726
--- /dev/null
+++ b/gcc/testsuite/g++.dg/modules/lambda-5_b.C
@@ -0,0 +1,10 @@
+// PR c++/105322
+// { dg-additional-options -fmodules-ts }
+
+import pr105322;
+
+int main() {
+  f1();
+  f2();
+  g3();
+}
-- 
2.42.0.398.ga9ecda2788



RE: [Patch] nvptx: Use fatal_error when -march= is missing not an assert [PR111093]

2023-10-18 Thread Roger Sayle

Hi Tomas, Tobias and Tom,
Thanks for asking.  Interestingly, I've a patch (attached) from last year that
tackled some of the issues here.  The surface problem is that nvptx's march
and misa are related in complicated ways.  Specifying an arch defines the
range of valid isa's, and specifying an isa restricts the set of valid arches.

The current approach, which I agree is problematic, is to force these to
be specified (compatibly) on the cc1 command line.  Certainly, an error
is better than an abort.  My proposed solution was to allow either to 
imply a default for the other, and only issue an error if they are explicitly
specified incompatibly.

One reason for supporting this approach was to ultimately support an
-march=native in the driver (calling libcuda.so to determine the hardware
available on the current machine).

The other use case is bumping the "default" nvptx architecture to something
more recent, say sm_53, by providing/honoring a default arch at configure
time.

Alas, it turns out that specifying a recent arch during GCC bootstrap, allows
the build to notice that the backend (now) supports 16-bit floats, which then
prompts libgcc to contain the floathf and fixhf support that would be required.
Then this in turn shows up as a limitation in the middle-end's handling of 
libcalls, which I submitted as a patch to back in July 2022:
https://gcc.gnu.org/pipermail/gcc-patches/2022-July/598848.html

That patch hasn't yet been approved, so the whole nvptx -march= patch
series became backlogged/forgotten.

Hopefully, the attached "proof-of-concept" patch looks interesting (food
for thought).  If this approach seems reasonable, I'm happy to brush the
dust off, and resubmit it (or a series of pieces) for review.

Best regards,
Roger
--

> -Original Message-
> From: Thomas Schwinge 
> Sent: 18 October 2023 11:16
> To: Tobias Burnus 
> Cc: gcc-patches@gcc.gnu.org; Tom de Vries ; Roger Sayle
> 
> Subject: Re: [Patch] nvptx: Use fatal_error when -march= is missing not an 
> assert
> [PR111093]
> 
> Hi Tobias!
> 
> On 2023-10-16T11:18:45+0200, Tobias Burnus 
> wrote:
> > While mkoffload ensures that there is always a -march=, nvptx's
> > cc1 can also be run directly.
> >
> > In my case, I wanted to know which target-specific #define are
> > available; hence, I did run:
> >accel/nvptx-none/cc1 -E -dM < /dev/null which gave an ICE. After
> > some debugging, the reasons was clear (missing -march=) but somehow a
> > (fatal) error would have been nicer than an ICE + debugging.
> >
> > OK for mainline?
> 
> Yes, thanks.  I think I prefer this over hard-coding some default 
> 'ptx_isa_option' --
> but may be convinced otherwise (incremental change), if that's maybe more
> convenient for others?  (Roger?)
> 
> 
> Grüße
>  Thomas
> 
> 
> > nvptx: Use fatal_error when -march= is missing not an assert
> > [PR111093]
> >
> > gcc/ChangeLog:
> >
> >   PR target/111093
> >   * config/nvptx/nvptx.cc (nvptx_option_override): Issue fatal error
> >   instead of an assert ICE when no -march= has been specified.
> >
> > diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
> > index edef39fb5e1..634c31673be 100644
> > --- a/gcc/config/nvptx/nvptx.cc
> > +++ b/gcc/config/nvptx/nvptx.cc
> > @@ -335,8 +335,9 @@ nvptx_option_override (void)
> >init_machine_status = nvptx_init_machine_status;
> >
> >/* Via nvptx 'OPTION_DEFAULT_SPECS', '-misa' always appears on the
> command
> > - line.  */
> > -  gcc_checking_assert (OPTION_SET_P (ptx_isa_option));
> > + line; but handle the case that the compiler is not run via the
> > + driver.  */  if (!OPTION_SET_P (ptx_isa_option))
> > +fatal_error (UNKNOWN_LOCATION, "%<-march=%> must be specified");
> >
> >handle_ptx_version_option ();
> >
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht
> München, HRB 106955
diff --git a/gcc/calls.cc b/gcc/calls.cc
index 6dd6f73..8a18eae 100644
--- a/gcc/calls.cc
+++ b/gcc/calls.cc
@@ -4795,14 +4795,20 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx 
value,
   else
{
  /* Convert to the proper mode if a promotion has been active.  */
- if (GET_MODE (valreg) != outmode)
+ enum machine_mode valmode = GET_MODE (valreg);
+ if (valmode != outmode)
{
  int unsignedp = TYPE_UNSIGNED (tfom);
 
  gcc_assert (promote_function_mode (tfom, outmode, ,
 fndecl ? TREE_TYPE (fndecl) : 
fntype, 1)
- == GET_MODE (valreg));
- valreg = convert_modes (outmode, GET_MODE (valreg), valreg, 0);
+ == valmode);
+ if (SCALAR_INT_MODE_P (valmode)
+ && SCALAR_FLOAT_MODE_P (outmode)
+ && known_gt 

[3/3] WIP/RFC: Fix name mangling for target_clones

2023-10-18 Thread Andrew Carlotti
This is a partial patch to make the mangling of function version names
for target_clones match those generated using the target or
target_version attributes.  It modifies the name of function versions,
but does not yet rename the resolved symbol, resulting in a duplicate
symbol name (and an error at assembly time).


Is this sort of approach ok?  Should I create an extra target hook to be called
here, so that the target_clones mangling can be target-specific but not
necessarily the same as for target attribute versioning?


diff --git a/gcc/cgraphclones.cc b/gcc/cgraphclones.cc
index 8af6b23d8c0306920e0fdcb3559ef047a16689f4..15672c02c6f9d6043a36bf081067f08d1ab834e5 100644
--- a/gcc/cgraphclones.cc
+++ b/gcc/cgraphclones.cc
@@ -1033,11 +1033,6 @@ cgraph_node::create_version_clone_with_body
   else
 new_decl = copy_node (old_decl);
 
-  /* Generate a new name for the new version. */
-  tree fnname = (version_decl ? clone_function_name_numbered (old_decl, suffix)
-   : clone_function_name (old_decl, suffix));
-  DECL_NAME (new_decl) = fnname;
-  SET_DECL_ASSEMBLER_NAME (new_decl, fnname);
   SET_DECL_RTL (new_decl, NULL);
 
   DECL_VIRTUAL_P (new_decl) = 0;
@@ -1065,6 +1060,24 @@ cgraph_node::create_version_clone_with_body
return NULL;
 }
 
+  /* Generate a new name for the new version. */
+  if (version_decl)
+    {
+      tree fnname = (clone_function_name_numbered (old_decl, suffix));
+      DECL_NAME (new_decl) = fnname;
+      SET_DECL_ASSEMBLER_NAME (new_decl, fnname);
+    }
+  else
+    {
+      /* Add target version mangling.  We assume that the target hook will
+         produce the same mangled name as it would have produced if the decl
+         had already been versioned when the hook was previously called.  */
+      tree fnname = DECL_ASSEMBLER_NAME (old_decl);
+      DECL_NAME (new_decl) = fnname;
+      fnname = targetm.mangle_decl_assembler_name (new_decl, fnname);
+      SET_DECL_ASSEMBLER_NAME (new_decl, fnname);
+    }
+
   /* When the old decl was a con-/destructor make sure the clone isn't.  */
   DECL_STATIC_CONSTRUCTOR (new_decl) = 0;
   DECL_STATIC_DESTRUCTOR (new_decl) = 0;
diff --git a/gcc/multiple_target.cc b/gcc/multiple_target.cc
index 3db57c2b13d612a37240d9dcf58ad21b2286633c..d9aec9a5ab532701b4a1877b440f3a553ffa28e2 100644
--- a/gcc/multiple_target.cc
+++ b/gcc/multiple_target.cc
@@ -162,7 +162,12 @@ create_dispatcher_calls (struct cgraph_node *node)
}
 }
 
-  tree fname = clone_function_name (node->decl, "default");
+  /* Add version mangling to default decl name.  We assume that the target
+     hook will produce the same mangled name as it would have produced if the
+     decl had already been versioned when the hook was previously called.  */
+  tree fname = DECL_ASSEMBLER_NAME (node->decl);
+  DECL_NAME (node->decl) = fname;
+  fname = targetm.mangle_decl_assembler_name (node->decl, fname);
   symtab->change_decl_assembler_name (node->decl, fname);
 
   if (node->definition)


[2/3] [aarch64] Add function multiversioning support

2023-10-18 Thread Andrew Carlotti
This adds initial support for function multiversion on aarch64 using the
target_version and target_clones attributes. This mostly follows the
Beta specification in the ACLE [1], with a few diffences that remain to
be fixed:

- Symbol mangling for target_clones differs from that for target_version
  and does not match the mangling specified in the ACLE. This
  inconsistency is also present in i386 and rs6000 mangling.
- The target_clones attribute does not currently support an implicit
  "default" version.
- Unrecognised target names in a target_clones attribute should be
  ignored (with an optional warning), but currently cause an error to be
  raised instead.
- There is no option to disable function multiversioning at compile
  time.
- There is no support for function multiversioning in C, since this is
  not yet enabled in the frontend. On the other hand, this patch
  happens to enable multiversioning in Ada and D as well, using their
  existing frontend support.
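
For reference, the usage this enables looks roughly as follows (a sketch
based on the ACLE document linked below; the feature names follow that
specification):

  __attribute__ ((target_version ("default")))
  int foo () { return 0; }

  __attribute__ ((target_version ("sve")))
  int foo () { return 1; }   // chosen at runtime when SVE is present

  __attribute__ ((target_clones ("default", "sve", "dotprod")))
  int bar () { return 2; }   // "default" listed explicitly, see above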

This patch relies on adding functionality to libgcc, to support:
- struct { unsigned long long features; } __aarch64_cpu_features;
- void __init_cpu_features (void);
- void __init_cpu_features_resolver (unsigned long hwcap,
 const __ifunc_arg_t *arg);
This support matches the interface currently used in LLVM's compiler-rt,
and will be implemented in a future patch (which will be merged before
merging this patch).

This version of the patch incorrectly uses __init_cpu_features in the
ifunc resolvers, which could lead to invalid library calls at load time.
I will fix this to use __init_cpu_features_resolver in a future version
of the patch.

[1] 
https://github.com/ARM-software/acle/blob/main/main/acle.md#function-multi-versioning

gcc/ChangeLog:

* attribs.cc (decl_attributes): Pass attribute name to target
hook.
* config/aarch64/aarch64.cc
(aarch64_process_target_version_attr): New.
(aarch64_option_valid_attribute_p): Add check and support for
target_version attribute.
(enum CPUFeatures): New list for bitmask positions.
(aarch64_fmv_feature_data): New.
(get_feature_bit): New.
(get_feature_mask_for_version): New.
(compare_feature_masks): New.
(aarch64_compare_version_priority): New.
(make_resolver_func): New.
(add_condition_to_bb): New.
(compare_feature_version_info): New.
(dispatch_function_versions): New.
(aarch64_generate_version_dispatcher_body): New.
(aarch64_get_function_versions_dispatcher): New.
(aarch64_common_function_versions): New.
(aarch64_mangle_decl_assembler_name): New.
(TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P): New implementation.
(TARGET_OPTION_EXPANDED_CLONES_ATTRIBUTE): New implementation.
(TARGET_OPTION_FUNCTION_VERSIONS): New implementation.
(TARGET_COMPARE_VERSION_PRIORITY): New implementation.
(TARGET_GENERATE_VERSION_DISPATCHER_BODY): New implementation.
(TARGET_GET_FUNCTION_VERSIONS_DISPATCHER): New implementation.
(TARGET_MANGLE_DECL_ASSEMBLER_NAME): New implementation.


diff --git a/gcc/attribs.cc b/gcc/attribs.cc
index a3c4a81e8582ea4fd06b9518bf51fad7c998ddd6..cc935b502028392ebdc105f940900f01f79196a7 100644
--- a/gcc/attribs.cc
+++ b/gcc/attribs.cc
@@ -657,7 +657,8 @@ decl_attributes (tree *node, tree attributes, int flags,
  options to the attribute((target(...))) list.  */
   if (TREE_CODE (*node) == FUNCTION_DECL
   && current_target_pragma
-  && targetm.target_option.valid_attribute_p (*node, NULL_TREE,
+  && targetm.target_option.valid_attribute_p (*node,
+ get_identifier("target"),
  current_target_pragma, 0))
 {
   tree cur_attr = lookup_attribute ("target", attributes);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9c3c0e705e2e6ea3b55b4a5f1e7d3360f91eb51d..ca0e2a2507ffdbf99e17b77240504bf2d175b9c0 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -19088,11 +19088,70 @@ aarch64_process_target_attr (tree args)
   return true;
 }
 
+/* Parse the tree in ARGS that contains the target_version attribute
+   information and update the global target options space.  */
+
+bool
+aarch64_process_target_version_attr (tree args)
+{
+  if (TREE_CODE (args) == TREE_LIST)
+    {
+      if (TREE_CHAIN (args))
+        {
+          error ("attribute %<target_version%> has multiple values");
+          return false;
+        }
+      args = TREE_VALUE (args);
+    }
+
+  if (!args || TREE_CODE (args) != STRING_CST)
+    {
+      error ("attribute %<target_version%> argument not a string");
+      return false;
+    }
+
+  const char *str = TREE_STRING_POINTER (args);
+  if (strcmp (str, "default") == 0)
+    return true;
+
+  auto with_plus = std::string ("+") + str;
+  enum aarch_parse_opt_result parse_res;
+  auto isa_flags 

[1/3] Add support for target_version attribute

2023-10-18 Thread Andrew Carlotti
This patch adds support for the "target_version" attribute to the middle
end and the C++ frontend, which will be used to implement function
multiversioning in the aarch64 backend.

Note that C++ is currently the only frontend which supports
multiversioning using the "target" attribute, whereas the
"target_clones" attribute is additionally supported in C, D and Ada.
Support for the target_version attribute will be extended to C at a
later date.

Targets that currently use the "target" attribute for function
multiversioning (i.e. i386 and rs6000) are not affected by this patch.


I could have implemented the target hooks slightly differently, by reusing the
valid_attribute_p hook and adding attribute name checks to each backend
implementation (c.f. the aarch64 implementation in patch 2/3).  Would this be
preferable?

Otherwise, is this ok for master?


gcc/c-family/ChangeLog:

* c-attribs.cc (handle_target_version_attribute): New.
(c_common_attribute_table): Add target_version.
(handle_target_clones_attribute): Add conflict with
target_version attribute.

gcc/ChangeLog:

* attribs.cc (is_function_default_version): Update comment to
specify incompatibility with target_version attributes.
* cgraphclones.cc (cgraph_node::create_version_clone_with_body):
Call valid_version_attribute_p for target_version attributes.
* target.def (valid_version_attribute_p): New hook.
(expanded_clones_attribute): New hook.
* doc/tm.texi.in: Add new hooks.
* doc/tm.texi: Regenerate.
* multiple_target.cc (create_dispatcher_calls): Remove redundant
is_function_default_version check.
(expand_target_clones): Use target hook for attribute name.
* targhooks.cc (default_target_option_valid_version_attribute_p):
New.
* targhooks.h (default_target_option_valid_version_attribute_p):
New.
* tree.h (DECL_FUNCTION_VERSIONED): Update comment to include
target_version attributes.

gcc/cp/ChangeLog:

* decl2.cc (check_classfn): Update comment to include
target_version attributes.


diff --git a/gcc/attribs.cc b/gcc/attribs.cc
index b1300018d1e8ed8e02ded1ea721dc192a6d32a49..a3c4a81e8582ea4fd06b9518bf51fad7c998ddd6 100644
--- a/gcc/attribs.cc
+++ b/gcc/attribs.cc
@@ -1233,8 +1233,9 @@ make_dispatcher_decl (const tree decl)
   return func_decl;  
 }
 
-/* Returns true if decl is multi-versioned and DECL is the default function,
-   that is it is not tagged with target specific optimization.  */
+/* Returns true if DECL is multi-versioned using the target attribute, and this
+   is the default version.  This function can only be used for targets that do
+   not support the "target_version" attribute.  */
 
 bool
 is_function_default_version (const tree decl)
diff --git a/gcc/c-family/c-attribs.cc b/gcc/c-family/c-attribs.cc
index 072cfb69147bd6b314459c0bd48a0c1fb92d3e4d..1a224c036277d51ab4dc0d33a403177bd226e48a 100644
--- a/gcc/c-family/c-attribs.cc
+++ b/gcc/c-family/c-attribs.cc
@@ -148,6 +148,7 @@ static tree handle_alloc_align_attribute (tree *, tree, 
tree, int, bool *);
 static tree handle_assume_aligned_attribute (tree *, tree, tree, int, bool *);
 static tree handle_assume_attribute (tree *, tree, tree, int, bool *);
 static tree handle_target_attribute (tree *, tree, tree, int, bool *);
+static tree handle_target_version_attribute (tree *, tree, tree, int, bool *);
 static tree handle_target_clones_attribute (tree *, tree, tree, int, bool *);
 static tree handle_optimize_attribute (tree *, tree, tree, int, bool *);
 static tree ignore_attribute (tree *, tree, tree, int, bool *);
@@ -480,6 +481,8 @@ const struct attribute_spec c_common_attribute_table[] =
  handle_error_attribute, NULL },
   { "target", 1, -1, true, false, false, false,
  handle_target_attribute, NULL },
+  { "target_version", 1, -1, true, false, false, false,
+ handle_target_version_attribute, NULL },
   { "target_clones",  1, -1, true, false, false, false,
  handle_target_clones_attribute, NULL },
   { "optimize",   1, -1, true, false, false, false,
@@ -5569,6 +5572,45 @@ handle_target_attribute (tree *node, tree name, tree 
args, int flags,
   return NULL_TREE;
 }
 
+/* Handle a "target_version" attribute.  */
+
+static tree
+handle_target_version_attribute (tree *node, tree name, tree args, int flags,
+                                 bool *no_add_attrs)
+{
+  /* Ensure we have a function type.  */
+  if (TREE_CODE (*node) != FUNCTION_DECL)
+    {
+      warning (OPT_Wattributes, "%qE attribute ignored", name);
+      *no_add_attrs = true;
+    }
+  else if (lookup_attribute ("target_clones", DECL_ATTRIBUTES (*node)))
+    {
+      warning (OPT_Wattributes, "%qE attribute ignored due to conflict "
+               "with %qs 

[0/3] target_version and aarch64 function multiversioning

2023-10-18 Thread Andrew Carlotti
This series adds support for function multiversioning on aarch64.  There are a
few minor issues in patch 2/3, that I intend to fix in future versions or
follow-up patches.  I also have some open questions about the correctness of
existing function multiversioning implementations [1], that could affect some
details of this patch series.

Patches 1/3 and 2/3 both pass regression testing on x86.  Patch 2/3 requires
adding function multiversioning tests to aarch64, which I haven't included yet.
Patch 3/3 demonstrates a potential approach for improving consistency of symbol
naming between target_clones and target/target_version multiversioning, but
would require agreement on how to resolve some of the issues discussed in [1].

Thanks,
Andrew


[1] https://gcc.gnu.org/pipermail/gcc/2023-October/242686.html


Re: [V3][PATCH 0/3] New attribute "counted_by" to annotate bounds for C99 FAM(PR108896)

2023-10-18 Thread Siddhesh Poyarekar

[Sorry, I forgot to respond to this]

On 2023-10-06 16:01, Martin Uecker wrote:

Am Freitag, dem 06.10.2023 um 06:50 -0400 schrieb Siddhesh Poyarekar:

On 2023-10-06 01:11, Martin Uecker wrote:

Am Donnerstag, dem 05.10.2023 um 15:35 -0700 schrieb Kees Cook:

On Thu, Oct 05, 2023 at 04:08:52PM -0400, Siddhesh Poyarekar wrote:

2. How would you handle signedness of the size field?  The size gets
converted to sizetype everywhere it is used and overflows/underflows may
produce interesting results.  Do you want to limit the types to unsigned or
do you want to add a disclaimer in the docs?  The former seems like the
*right* thing to do given that it is a new feature; best to enforce the
cleaner habit at the outset.


The Linux kernel has a lot of "int" counters, so the goal is to catch
negative offsets just like too-large offsets at runtime with the sanitizer
and report 0 for __bdos. Refactoring all these to be unsigned is going
to take time since at least some of them use the negative values as
special values unrelated to array indexing. :(

So, perhaps if unsigned counters are worth enforcing, can this be a
separate warning the kernel can turn off initially?



I think unsigned counters are much more problematic than signed ones
because wraparound errors are more difficult to find.

With unsigned you could potentially diagnose wraparound, but only if we
add -fsanitize=unsigned-overflow *and* add mechanism to mark intentional
wraparound *and* everybody adds this annotation after carefully screening
their code *and* rewriting all operations such as (counter - 3) + 5
where the wraparound in the intermediate expression is harmless.

For this reason, I do not think we should ever enforce some rule that
the counter has to be unsigned.

What we could do, is detect *storing* negative values into the
counter at run-time using UBSan. (but if negative values are
used for special cases, one also should be able to turn this
off).


All of the object size detection relies on object sizes being sizetype.
The closest we could do with that is detect (sz != SIZE_MAX && sz >
SIZE_MAX / 2), since allocators typically cannot allocate more than
SIZE_MAX / 2.


I was talking about the counter in:

struct {
   int counter;
   char buf[] __counted_by__((counter));
};

which could be checked to be positive either when stored to or
when buf is used.

And yes, we could also check the size of buf.  Not sure what is
done for VLAs now, but I guess it could be similar.


Right now all object sizes are cast to sizetype and the generated 
dynamic expressions are such that overflows will result in the computed 
object size being zero.  Non-generated expressions (like we could get 
with __counted_by__) will simply be cast; there's probably scope for 
improvement here, where we wrap that with an expression that returns 0 
if the size exceeds SIZE_MAX / 2 since that's typically the limit for 
allocators.  We use that heuristic elsewhere in the __bos/__bdos logic too.
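
Roughly, that wrapping would behave like this sketch (illustrative C,
not the actual generated tree; p and counter are placeholders):

  size_t sz = (size_t) counter * sizeof *p;   /* counter cast to sizetype */
  size_t bdos = sz > SIZE_MAX / 2 ? 0 : sz;   /* implausible size => 0 */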


Thanks,
Sid


Re: [V3][PATCH 1/3] Provide counted_by attribute to flexible array member field (PR108896)

2023-10-18 Thread Qing Zhao


> On Oct 18, 2023, at 11:18 AM, Siddhesh Poyarekar  wrote:
> 
> On 2023-10-18 10:51, Qing Zhao wrote:
> +   member FIELD_DECL is a valid field of the containing structure's 
> fieldlist,
> +   FIELDLIST, Report error and remove this attribute when it's not.  */
> +static void
> +verify_counted_by_attribute (tree fieldlist, tree field_decl)
> +{
> +  tree attr_counted_by = lookup_attribute ("counted_by",
> +   DECL_ATTRIBUTES (field_decl));
> +
> +  if (!attr_counted_by)
> +return;
> +
> +  /* If there is a counted_by attribute attached to the field,
> + verify it.  */
> +
> +  const char *fieldname
> += IDENTIFIER_POINTER (TREE_VALUE (TREE_VALUE (attr_counted_by)));
> +
> +  /* Verify the argument of the attrbute is a valid field of the
 s/attrbute/attribute/
> + containing structure.  */
> +
> +  tree counted_by_field = get_named_field (fieldlist, fieldname);
> +
> +  /* Error when the field is not found in the containing structure.  */
> +  if (!counted_by_field)
> +{
> +  error_at (DECL_SOURCE_LOCATION (field_decl),
> +"%qE attribute argument not a field declaration"
> +" in the same structure, ignore it",
> +(get_attribute_name (attr_counted_by)));
 Probably someone with English as a first language would make a better 
 suggestion, but how about:
   Argument specified in %qE attribute is not a field declaration in the
   same structure, ignoring it.
> +
> +  DECL_ATTRIBUTES (field_decl)
> += remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
> +}
> +  else
> +  /* Error when the field is not with an integer type.  */
 Suggest: Flag an error when the field is not of an integer type.
> +{
> +  while (TREE_CHAIN (counted_by_field))
> +counted_by_field = TREE_CHAIN (counted_by_field);
> +  tree real_field = TREE_VALUE (counted_by_field);
> +
> +  if (TREE_CODE (TREE_TYPE (real_field)) != INTEGER_TYPE)
> +{
> +  error_at (DECL_SOURCE_LOCATION (field_decl),
> + "%qE attribute argument not a field declaration"
> + " with integer type, ignore it",
> + (get_attribute_name (attr_counted_by)));
 Suggest:
   Argument specified in %qE attribute is not of an integer type,
   ignoring it.
> +
> +  DECL_ATTRIBUTES (field_decl)
> += remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
> +}
> +}
> +
> +  return;
>>> 
>>> I forgot to mention the redundant return here.
>> Could you please clarify a little bit here, why the return here is redundant?
> 
> It's the last line in the function, so even without that statement the 
> function will return.
Oh, I see. -:)
Actually, I always put an explicit return there even though it’s the last line
and returns implicitly.

Qing

> 
> Thanks,
> Sid



aarch64: Replace duplicated selftests

2023-10-18 Thread Andrew Carlotti
Pushed as obvious.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_test_fractional_cost):
Test <= instead of testing < twice.


diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
2b0de7ca0389be6698c329b54f9501b8ec09183f..9c3c0e705e2e6ea3b55b4a5f1e7d3360f91eb51d
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27529,18 +27529,18 @@ aarch64_test_fractional_cost ()
   ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
   ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
 
-  ASSERT_TRUE (cf (4, 15) < cf (5, 15));
-  ASSERT_FALSE (cf (5, 15) < cf (5, 15));
-  ASSERT_FALSE (cf (6, 15) < cf (5, 15));
-  ASSERT_TRUE (cf (1, 3) < cf (2, 5));
-  ASSERT_TRUE (cf (1, 12) < cf (1, 6));
-  ASSERT_FALSE (cf (5, 3) < cf (5, 3));
-  ASSERT_TRUE (cf (239, 240) < 1);
-  ASSERT_FALSE (cf (240, 240) < 1);
-  ASSERT_FALSE (cf (241, 240) < 1);
-  ASSERT_FALSE (2 < cf (207, 104));
-  ASSERT_FALSE (2 < cf (208, 104));
-  ASSERT_TRUE (2 < cf (209, 104));
+  ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
+  ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
+  ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
+  ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
+  ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
+  ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
+  ASSERT_TRUE (cf (239, 240) <= 1);
+  ASSERT_TRUE (cf (240, 240) <= 1);
+  ASSERT_FALSE (cf (241, 240) <= 1);
+  ASSERT_FALSE (2 <= cf (207, 104));
+  ASSERT_TRUE (2 <= cf (208, 104));
+  ASSERT_TRUE (2 <= cf (209, 104));
 
   ASSERT_TRUE (cf (4, 15) < cf (5, 15));
   ASSERT_FALSE (cf (5, 15) < cf (5, 15));


Re: [V3][PATCH 1/3] Provide counted_by attribute to flexible array member field (PR108896)

2023-10-18 Thread Siddhesh Poyarekar

On 2023-10-18 10:51, Qing Zhao wrote:



+   member FIELD_DECL is a valid field of the containing structure's fieldlist,
+   FIELDLIST, Report error and remove this attribute when it's not.  */
+static void
+verify_counted_by_attribute (tree fieldlist, tree field_decl)
+{
+  tree attr_counted_by = lookup_attribute ("counted_by",
+   DECL_ATTRIBUTES (field_decl));
+
+  if (!attr_counted_by)
+return;
+
+  /* If there is an counted_by attribute attached to the field,
+ verify it.  */
+
+  const char *fieldname
+= IDENTIFIER_POINTER (TREE_VALUE (TREE_VALUE (attr_counted_by)));
+
+  /* Verify the argument of the attrbute is a valid field of the

s/attrbute/attribute/

+ containing structure.  */
+
+  tree counted_by_field = get_named_field (fieldlist, fieldname);
+
+  /* Error when the field is not found in the containing structure.  */
+  if (!counted_by_field)
+{
+  error_at (DECL_SOURCE_LOCATION (field_decl),
+"%qE attribute argument not a field declaration"
+" in the same structure, ignore it",
+(get_attribute_name (attr_counted_by)));

Probably someone with English as a first language would make a better 
suggestion, but how about:
   Argument specified in %qE attribute is not a field declaration in the
   same structure, ignoring it.

+
+  DECL_ATTRIBUTES (field_decl)
+= remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
+}
+  else
+  /* Error when the field is not with an integer type.  */

Suggest: Flag an error when the field is not of an integer type.

+{
+  while (TREE_CHAIN (counted_by_field))
+counted_by_field = TREE_CHAIN (counted_by_field);
+  tree real_field = TREE_VALUE (counted_by_field);
+
+  if (TREE_CODE (TREE_TYPE (real_field)) != INTEGER_TYPE)
+{
+  error_at (DECL_SOURCE_LOCATION (field_decl),
+ "%qE attribute argument not a field declaration"
+ " with integer type, ignore it",
+ (get_attribute_name (attr_counted_by)));

Suggest:
   Argument specified in %qE attribute is not of an integer type,
   ignoring it.

+
+  DECL_ATTRIBUTES (field_decl)
+= remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
+}
+}
+
+  return;


I forgot to mention the redundant return here.


Could you please clarify a little bit here, why the return here is redundant?


It's the last line in the function, so even without that statement the 
function will return.


Thanks,
Sid


Re: [PATCH] vect: Allow same precision for bit-precision conversions.

2023-10-18 Thread Richard Biener



> On 18.10.2023 at 16:19, Robin Dapp wrote:
> 
> Hi,
> 
> even though there was no full conclusion yet I took the liberty of
> just posting this as a patch in case of further discussion.
> 
> In PR/111794 we miss a vectorization because on riscv type precision and
> mode precision differ for mask types.  We can still vectorize when
> allowing assignments with the same precision for dest and source which
> is what this patch does.
> 
> Bootstrapped and regtested on x86, aarch64 and power10.  No new failures
> on riscv.

It looks safe, thus OK.

Richard.

> Regards
> Robin
> 
> gcc/ChangeLog:
> 
>PR/111794
> 
>* tree-vect-stmts.cc (vectorizable_assignment): Add
>same-precision exception for dest and source.
> 
> gcc/testsuite/ChangeLog:
> 
>* gcc.target/riscv/rvv/autovec/slp-mask-1.c: New test.
>* gcc.target/riscv/rvv/autovec/slp-mask-run-1.c: New test.
> ---
> .../gcc.target/riscv/rvv/autovec/slp-mask-1.c | 18 +++
> .../riscv/rvv/autovec/slp-mask-run-1.c| 31 +++
> gcc/tree-vect-stmts.cc| 12 ---
> 3 files changed, 56 insertions(+), 5 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c
> 
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c
> new file mode 100644
> index 000..ee1baa58d63
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-std=gnu99 -O3 -march=rv64gcv -mabi=lp64d 
> --param=riscv-autovec-preference=scalable -fdump-tree-slp-details" } */
> +
> +void
> +__attribute__ ((noipa))
> +f (int *restrict x, short *restrict y, int *restrict res)
> +{
> +  res[0] = x[0] == 1 & y[0] == 2;
> +  res[1] = x[1] == 1 & y[1] == 2;
> +  res[2] = x[2] == 1 & y[2] == 2;
> +  res[3] = x[3] == 1 & y[3] == 2;
> +  res[4] = x[4] == 1 & y[4] == 2;
> +  res[5] = x[5] == 1 & y[5] == 2;
> +  res[6] = x[6] == 1 & y[6] == 2;
> +  res[7] = x[7] == 1 & y[7] == 2;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp2" 
> } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c
> new file mode 100644
> index 000..b3469c41c87
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-std=gnu99 -O3 -march=rv64gcv -mabi=lp64d 
> --param=riscv-autovec-preference=scalable" } */
> +
> +#include <stdlib.h>
> +#include <stdint.h>
> +
> +#include "slp-mask-1.c"
> +
> +#define SZ 8
> +
> +__attribute__ ((optimize ("1")))
> +int main ()
> +{
> +  int *a = malloc (SZ * sizeof (*a));
> +  short *b = malloc (SZ * sizeof (*b));
> +  int *res = malloc (SZ * sizeof (*res));
> +  int *ref = malloc (SZ * sizeof (*ref));
> +
> +  for (int i = 0; i < SZ; i++)
> +{
> +  a[i] = i & 1;
> +  b[i] = 2;
> +  ref[i] = a[i] == 1 & b[i] == 2;
> +}
> +
> +  f (a, b, res);
> +
> +  for (int i = 0; i < SZ; i++)
> +if (res[i] != ref[i])
> +  __builtin_abort ();
> +}
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index cd7c1090d88..e612da6c492 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6084,14 +6084,16 @@ vectorizable_assignment (vec_info *vinfo,
>   /* But a conversion that does not change the bit-pattern is ok.  */
>   && !(INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
>   && INTEGRAL_TYPE_P (TREE_TYPE (op))
> -   && (TYPE_PRECISION (TREE_TYPE (scalar_dest))
> +   && (((TYPE_PRECISION (TREE_TYPE (scalar_dest))
>   > TYPE_PRECISION (TREE_TYPE (op)))
> -   && TYPE_UNSIGNED (TREE_TYPE (op
> + && TYPE_UNSIGNED (TREE_TYPE (op)))
> +   || (TYPE_PRECISION (TREE_TYPE (scalar_dest))
> +   == TYPE_PRECISION (TREE_TYPE (op))
> {
>   if (dump_enabled_p ())
> -dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "type conversion to/from bit-precision "
> - "unsupported.\n");
> +dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "type conversion to/from bit-precision "
> + "unsupported.\n");
>   return false;
> }
> 
> -- 
> 2.41.0
> 


[PATCH V2 3/7] aarch64: Implement system register validation tools

2023-10-18 Thread Victor Do Nascimento
Given the implementation of a mechanism of encoding system registers
into GCC, this patch provides the mechanism of validating their use by
the compiler.  In particular, this involves:

  1. Ensuring a supplied string corresponds to a known system
 register name.  System registers can be accessed either via their
 name (e.g. `SPSR_EL1') or their encoding (e.g. `S3_0_C4_C0_0').
 Register names are validated using a hash map, mapping known
 system register names to their corresponding `sysreg_t' structs,
 which is populated from the `aarch64-sys-regs.def' file.
 Register name validation is done via `lookup_sysreg_map', while
 the encoding naming convention is validated via a parser
 implemented in this patch - `is_implem_def_reg'.
  2. Once a given register name is deemed to be valid, it is checked
 against two further criteria:
   a. Is the referenced register implemented in the target
  architecture?  This is achieved by comparing the ARCH field
  in the relevant SYSREG entry from `aarch64_system_regs.def'
  against `aarch64_feature_flags' flags set at compile-time.
   b. Is the register being used correctly?  Check the requested
  operation against the FLAGS specified in SYSREG.
  This prevents operations like writing to a read-only system
  register.
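
To make the two criteria concrete, here is an illustrative user-level
sketch, assuming `ctr_el0' is tagged F_REG_READ in the .def file (the
exact diagnostics are not shown):

#include <arm_acle.h>

unsigned long long
read_cache_type (void)
{
  return __arm_rsr64 ("ctr_el0");  /* reading a readable register: accepted */
}

void
bad_write (void)
{
  __arm_wsr64 ("ctr_el0", 0);  /* writing a read-only register: rejected
                                  at compile time */
}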

gcc/ChangeLog:

* gcc/config/aarch64/aarch64-protos.h (aarch64_valid_sysreg_name_p): 
New.
(aarch64_retrieve_sysreg): Likewise.
* gcc/config/aarch64/aarch64.cc (is_implem_def_reg): Likewise.
(aarch64_valid_sysreg_name_p): Likewise.
(aarch64_retrieve_sysreg): Likewise.
(aarch64_register_sysreg): Likewise.
(aarch64_init_sysregs): Likewise.
(aarch64_lookup_sysreg_map): Likewise.
* gcc/config/aarch64/predicates.md (aarch64_sysreg_string): New.
---
 gcc/config/aarch64/aarch64-protos.h |   2 +
 gcc/config/aarch64/aarch64.cc   | 146 
 gcc/config/aarch64/predicates.md|   4 +
 3 files changed, 152 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 60a55f4bc19..a134e2fcf8e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -830,6 +830,8 @@ bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
 bool aarch64_simd_valid_immediate (rtx, struct simd_immediate_info *,
enum simd_immediate_check w = AARCH64_CHECK_MOV);
+bool aarch64_valid_sysreg_name_p (const char *);
+const char *aarch64_retrieve_sysreg (char *, bool);
 rtx aarch64_check_zero_based_sve_index_immediate (rtx);
 bool aarch64_sve_index_immediate_p (rtx);
 bool aarch64_sve_arith_immediate_p (machine_mode, rtx, bool);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 69de2366424..816c4b69fc8 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -85,6 +85,7 @@
 #include "config/arm/aarch-common.h"
 #include "config/arm/aarch-common-protos.h"
 #include "ssa.h"
+#include "hash-map.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -2845,6 +2846,52 @@ const sysreg_t sysreg_structs[] =
 const unsigned nsysreg = TOTAL_ITEMS;
 #undef TOTAL_ITEMS
 
+using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
+static sysreg_map_t *sysreg_map = nullptr;
+
+/* Map system register names to their hardware metadata: Encoding,
+   feature flags and architectural feature requirements, all of which
+   are encoded in a sysreg_t struct.  */
+void
+aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
+{
+  bool dup = sysreg_map->put (name, metadata);
+  gcc_checking_assert (!dup);
+}
+
+/* Lazily initialize hash table for system register validation,
+   checking the validity of supplied register name and returning
+   register's associated metadata.  */
+static void
+aarch64_init_sysregs (void)
+{
+  gcc_assert (!sysreg_map);
+  sysreg_map = new sysreg_map_t;
+  gcc_assert (sysreg_map);
+
+  for (unsigned i = 0; i < nsysreg; i++)
+{
+  const sysreg_t *reg = sysreg_structs + i;
+  aarch64_register_sysreg (reg->name , reg);
+}
+}
+
+/* No direct access to the sysreg hash-map should be made.  Doing so
+   risks trying to access an uninitialized hash-map; dereferencing the
+   returned double pointer without due care risks dereferencing a
+   null pointer.  */
+const sysreg_t *
+aarch64_lookup_sysreg_map (const char *regname)
+{
+  if (!sysreg_map)
+aarch64_init_sysregs ();
+
+  const sysreg_t **sysreg_entry = sysreg_map->get (regname);
+  if (sysreg_entry != NULL)
+return *sysreg_entry;
+  return NULL;
+}
+
 /* The current tuning set.  */
 struct tune_params aarch64_tune_params = generic_tunings;
 
@@ -28053,6 +28100,105 @@ aarch64_pars_overlap_p (rtx par1, rtx par2)
   return false;
 }
 
+/* Parse an implementation-defined system register name of

[PATCH V2 6/7] aarch64: Add front-end argument type checking for target builtins

2023-10-18 Thread Victor Do Nascimento
In implementing the ACLE read/write system register builtins it was
observed that leaving argument type checking to be done at expand-time
meant that poorly-formed function calls were being "fixed" by certain
optimization passes, meaning bad code wasn't being properly picked up
in checking.

Example:

  const char *regname = "amcgcr_el0";
  long long a = __builtin_aarch64_rsr64 (regname);

is reduced by the ccp1 pass to

  long long a = __builtin_aarch64_rsr64 ("amcgcr_el0");

As these functions require an argument of STRING_CST type, there needs
to be a check carried out by the front-end capable of picking this up.

The introduced `check_general_builtin_call' function will be called by
the TARGET_CHECK_BUILTIN_CALL hook whenever a call to a builtin
belonging to the AARCH64_BUILTIN_GENERAL category is encountered,
carrying out any appropriate checks associated with a particular
builtin function code.

gcc/ChangeLog:

* gcc/config/aarch64/aarch64-builtins.cc (check_general_builtin_call):
New.
* gcc/config/aarch64/aarch64-c.cc (aarch64_check_builtin_call):
Add check_general_builtin_call call.
* gcc/config/aarch64/aarch64-protos.h (check_general_builtin_call):
New.

gcc/testsuite/ChangeLog:

* gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c: New.
---
 gcc/config/aarch64/aarch64-builtins.cc| 33 +++
 gcc/config/aarch64/aarch64-c.cc   |  4 +--
 gcc/config/aarch64/aarch64-protos.h   |  3 ++
 .../gcc.target/aarch64/acle/rwsr-2.c  | 15 +
 4 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index d8bb2a989a5..6734361f4f4 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2126,6 +2126,39 @@ aarch64_general_builtin_decl (unsigned code, bool)
   return aarch64_builtin_decls[code];
 }
 
+bool
+check_general_builtin_call (location_t location, vec<location_t>,
+   unsigned int code, tree fndecl,
+   unsigned int nargs ATTRIBUTE_UNUSED, tree *args)
+{
+  switch (code)
+{
+case AARCH64_RSR:
+case AARCH64_RSRP:
+case AARCH64_RSR64:
+case AARCH64_RSRF:
+case AARCH64_RSRF64:
+case AARCH64_WSR:
+case AARCH64_WSRP:
+case AARCH64_WSR64:
+case AARCH64_WSRF:
+case AARCH64_WSRF64:
+  if (TREE_CODE (args[0]) == VAR_DECL
+ || TREE_CODE (TREE_TYPE (args[0])) != POINTER_TYPE
+ || TREE_CODE (TREE_OPERAND (TREE_OPERAND (args[0], 0) , 0))
+ != STRING_CST)
+   {
+ const char  *fn_name, *err_msg;
+ fn_name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
+ err_msg = "first argument to %<%s%> must be a string literal";
+ error_at (location, err_msg, fn_name);
+ return false;
+   }
+}
+  /* Default behavior.  */
+  return true;
+}
+
 typedef enum
 {
   SIMD_ARG_COPY_TO_REG,
diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index ab8844f6049..c2a9a59df73 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -339,8 +339,8 @@ aarch64_check_builtin_call (location_t loc, vec<location_t> arg_loc,
   switch (code & AARCH64_BUILTIN_CLASS)
 {
 case AARCH64_BUILTIN_GENERAL:
-  return true;
-
+  return check_general_builtin_call (loc, arg_loc, subcode, orig_fndecl,
+nargs, args);
 case AARCH64_BUILTIN_SVE:
   return aarch64_sve::check_builtin_call (loc, arg_loc, subcode,
  orig_fndecl, nargs, args);
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index a134e2fcf8e..9ef96ff511f 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -990,6 +990,9 @@ tree aarch64_general_builtin_rsqrt (unsigned int);
 void handle_arm_acle_h (void);
 void handle_arm_neon_h (void);
 
+bool check_general_builtin_call (location_t, vec<location_t>, unsigned int,
+ tree, unsigned int, tree *);
+
 namespace aarch64_sve {
   void init_builtins ();
   void handle_arm_sve_h ();
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c 
b/gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c
new file mode 100644
index 000..72e5fb75b21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c
@@ -0,0 +1,15 @@
+/* Test the __arm_[r,w]sr ACLE intrinsics family.  */
+/* Ensure that illegal behavior is rejected by the compiler.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.4-a" } */
+
+#include <arm_acle.h>
+
+void
+test_non_const_sysreg_name ()
+{
+  const char *regname = "trcseqstr";
+  long long a = __arm_rsr64 (regname); /* { dg-error "first argument to 
'__builtin_aarch64_rsr64' must be a string literal" } */
+  __arm_wsr64 (regname, 

[PATCH V2 2/7] aarch64: Add support for aarch64-sys-regs.def

2023-10-18 Thread Victor Do Nascimento
This patch defines the structure of a new .def file used for
representing the aarch64 system registers, what information it should
hold and the basic framework in GCC to process this file.

Entries in the aarch64-system-regs.def file should be as follows:

  SYSREG (NAME, CPENC (sn,op1,cn,cm,op2), FLAG1 | ... | FLAGn, ARCH)

Where the arguments to SYSREG correspond to:
  - NAME:  The system register name, as used in the assembly language.
  - CPENC: The system register encoding, mapping to:

   s<sn>_<op1>_c<cn>_c<cm>_<op2>

  - FLAG: The entries in the FLAGS field are bitwise-OR'd together to
  encode extra information required to ensure proper use of
  the system register.  For example, a read-only system
  register will have the flag F_REG_READ, while write-only
  registers will be labeled F_REG_WRITE.  Such flags are
  tested against at compile-time.
  - ARCH: The architectural features the system register is associated
  with.  This is encoded via one of three possible macros:
  1. When a system register is universally implemented, we say
  it has no feature requirements, so we tag it with the
  AARCH64_NO_FEATURES macro.
  2. When a register is only implemented for a single
  architectural extension EXT, AARCH64_FEATURE (EXT) is
  used.
  3. When a given system register is made available by any of N
  possible architectural extensions, the AARCH64_FEATURES(N, ...)
  macro is used to combine them accordingly.
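
For illustration, entries exercising each of the three ARCH cases might
look as follows; the first two mirror entries from the Binutils file,
while the third names a made-up register purely to demonstrate the
AARCH64_FEATURES form:

  SYSREG ("actlr_el1",   CPENC (3,0,1,0,1), 0,         AARCH64_NO_FEATURES)
  SYSREG ("afsr0_el12",  CPENC (3,5,5,1,0), F_ARCHEXT, AARCH64_FEATURE (V8_1A))
  SYSREG ("made_up_reg", CPENC (3,0,0,0,7), F_ARCHEXT,
          AARCH64_FEATURES (2, SVE, SVE2))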

In order to enable proper interpretation of the SYSREG entries by the
compiler, flags defining system register behavior such as `F_REG_READ'
and `F_REG_WRITE' are also defined here, so they can later be used for
the validation of system register properties.

Finally, any architectural feature flags from Binutils missing from GCC
have appropriate aliases defined here so as to ensure
cross-compatibility of SYSREG entries across the toolchain.

gcc/ChangeLog:

* gcc/config/aarch64/aarch64.cc (sysreg_t): New.
(sysreg_structs): Likewise.
(nsysreg): Likewise.
(AARCH64_FEATURE): Likewise.
(AARCH64_FEATURES): Likewise.
(AARCH64_NO_FEATURES): Likewise.
* gcc/config/aarch64/aarch64.h (AARCH64_ISA_V8A): Add missing
ISA flag.
(AARCH64_ISA_V8_1A): Likewise.
(AARCH64_ISA_V8_7A): Likewise.
(AARCH64_ISA_V8_8A): Likewise.
(AARCH64_NO_FEATURES): Likewise.
(AARCH64_FL_RAS): New ISA flag alias.
(AARCH64_FL_LOR): Likewise.
(AARCH64_FL_PAN): Likewise.
(AARCH64_FL_AMU): Likewise.
(AARCH64_FL_SCXTNUM): Likewise.
(AARCH64_FL_ID_PFR2): Likewise.
(F_DEPRECATED): New.
(F_REG_READ): Likewise.
(F_REG_WRITE): Likewise.
(F_ARCHEXT): Likewise.
(F_REG_ALIAS): Likewise.
---
 gcc/config/aarch64/aarch64.cc | 38 +++
 gcc/config/aarch64/aarch64.h  | 36 +
 2 files changed, 74 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9fbfc548a89..69de2366424 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2807,6 +2807,44 @@ static const struct processor all_cores[] =
   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
 };
 
+typedef struct {
+  const char* name;
+  const char* encoding;
+  const unsigned properties;
+  const unsigned long long arch_reqs;
+} sysreg_t;
+
+/* An aarch64_feature_set initializer for a single feature,
+   AARCH64_FEATURE_.  */
+#define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
+
+/* Used by AARCH64_FEATURES.  */
+#define AARCH64_OR_FEATURES_1(X, F1) \
+  AARCH64_FEATURE (F1)
+#define AARCH64_OR_FEATURES_2(X, F1, F2) \
+  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
+#define AARCH64_OR_FEATURES_3(X, F1, ...) \
+  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
+
+/* An aarch64_feature_set initializer for the N features listed in "...".  */
+#define AARCH64_FEATURES(N, ...) \
+  AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
+
+/* Database of system registers, their encodings and architectural
+   requirements.  */
+const sysreg_t sysreg_structs[] =
+{
+#define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
+#define SYSREG(NAME, ENC, FLAGS, ARCH) \
+  { NAME, ENC, FLAGS, ARCH },
+#include "aarch64-sys-regs.def"
+#undef CPENC
+};
+
+#define TOTAL_ITEMS (sizeof sysreg_structs / sizeof sysreg_structs[0])
+const unsigned nsysreg = TOTAL_ITEMS;
+#undef TOTAL_ITEMS
+
 /* The current tuning set.  */
 struct tune_params aarch64_tune_params = generic_tunings;
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index d74e9116fc5..cf3969a11aa 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -179,6 +179,8 @@ enum class aarch64_feature : unsigned char {
 
 /* Macros to test ISA flags.  */
 

[PATCH V2 7/7] aarch64: Add system register duplication check selftest

2023-10-18 Thread Victor Do Nascimento
Add a build-time test to check whether system register data, as
imported from `aarch64-sys-regs.def', has any duplicate entries.

Duplicate entries are defined as any two SYSREG entries in the .def
file which share the same encoding values (as specified by its `CPENC'
field) and where the relationship amongst the two does not fit into
one of the following categories:

* Simple aliasing: In some cases, it is observed that one
register name serves as an alias to another.  One example of
this is where TRCEXTINSELR aliases TRCEXTINSELR0.
* Expressing intent: It is possible that when a given register
serves two distinct functions depending on how it is used, it
is given two distinct names whose use should match the context
under which it is being used.  Example:  Debug Data Transfer
Register. When used to receive data, it should be accessed as
DBGDTRRX_EL0 while when transmitting data it should be
accessed via DBGDTRTX_EL0.
* Register deprecation: Some register names have been
deprecated and should no longer be used, but backwards-
compatibility requires that such names continue to be
recognized, as is the case for the SPSR_EL1 register, whose
access via the SPSR_SVC name is now deprecated.
* Same encoding different target: Some encodings are given
different meaning depending on the target architecture and, as
such, are given different names in each of these contexts.
We see an example of this for CPENC(3,4,2,0,0), which
corresponds to TTBR0_EL2 for Armv8-A targets and VSCTLR_EL2
in Armv8-R targets.

A consequence of these observations is that `CPENC' duplication is
acceptable iff at least one of the `properties' or `arch_reqs' fields
of the `sysreg_t' structs associated with the two registers in
question differ and it's this condition that is checked by the new
`aarch64_test_sysreg_encoding_clashes' function.

gcc/ChangeLog:

* gcc/config/aarch64/aarch64.cc
(aarch64_test_sysreg_encoding_clashes): New.
(aarch64_run_selftests): add call to
aarch64_test_sysreg_encoding_clashes selftest.
---
 gcc/config/aarch64/aarch64.cc | 53 +++
 1 file changed, 53 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d187e171beb..e0be2877ede 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22,6 +22,7 @@
 
 #define INCLUDE_STRING
 #define INCLUDE_ALGORITHM
+#define INCLUDE_VECTOR
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -28332,6 +28333,57 @@ aarch64_test_fractional_cost ()
   ASSERT_EQ (cf (1, 2).as_double (), 0.5);
 }
 
+/* Calculate whether our system register data, as imported from
+   `aarch64-sys-regs.def', has any duplicate entries.  */
+static void
+aarch64_test_sysreg_encoding_clashes (void)
+{
+  using dup_counters_t = hash_map<nofree_string_hash, unsigned>;
+  using dup_instances_t
+    = hash_map<nofree_string_hash, std::vector<const sysreg_t *>>;
+
+  dup_counters_t duplicate_counts;
+  dup_instances_t duplicate_instances;
+
+  /* Every time an encoding is established to come up more than once
+  we add it to a "clash-analysis queue", which is then used to extract
+  necessary information from our hash map when establishing whether
+  repeated encodings are valid.  */
+
+  /* 1) Collect recurrence information.  */
+  std::vector<const char *> testqueue;
+
+  for (unsigned i = 0; i < nsysreg; i++)
+{
+  const sysreg_t *reg = sysreg_structs + i;
+
+  unsigned *tbl_entry = &duplicate_counts.get_or_insert (reg->encoding);
+  *tbl_entry += 1;
+
+  std::vector<const sysreg_t *> *tmp
+   = &duplicate_instances.get_or_insert (reg->encoding);
+
+  tmp->push_back (reg);
+  if (*tbl_entry > 1)
+ testqueue.push_back (reg->encoding);
+}
+
+  /* 2) Carry out analysis on collected data.  */
+  for (auto enc : testqueue)
+{
+  unsigned nrep = *duplicate_counts.get (enc);
+  for (unsigned i = 0; i < nrep; i++)
+   for (unsigned j = i+1; j < nrep; j++)
+ {
+   std::vector<const sysreg_t *> *tmp2 = duplicate_instances.get (enc);
+   const sysreg_t *a = (*tmp2)[i];
+   const sysreg_t *b = (*tmp2)[j];
+   ASSERT_TRUE ((a->properties != b->properties)
+|| (a->arch_reqs != b->arch_reqs));
+ }
+}
+}
+
 /* Run all target-specific selftests.  */
 
 static void
@@ -28339,6 +28391,7 @@ aarch64_run_selftests (void)
 {
   aarch64_test_loading_full_dump ();
   aarch64_test_fractional_cost ();
+  aarch64_test_sysreg_encoding_clashes ();
 }
 
 } // namespace selftest
-- 
2.41.0



[PATCH V2 1/7] aarch64: Sync system register information with Binutils

2023-10-18 Thread Victor Do Nascimento
This patch adds the `aarch64-sys-regs.def' file, originally written
for Binutils, to GCC. In so doing, it provides GCC with the necessary
information for teaching the compiler about system registers known to
the assembler and how these can be used.

By aligning the representation of data common to different parts of
the toolchain we can greatly reduce the duplication of work,
facilitating the maintenance of the aarch64 back-end across different
parts of the toolchain; By keeping both copies of the file in sync,
any `SYSREG (...)' that is added in one project is automatically added
to its counterpart.  This being the case, no change should be made in
the GCC copy of the file.  Any modifications should first be made in
Binutils and the resulting file copied over to GCC.

GCC does not implement the full range of ISA flags present in
Binutils.  Where this is the case, aliases must be added to aarch64.h
with the unknown architectural extension being mapped to its
associated base architecture, such that any flag present in Binutils
and used in system register definitions is understood in GCC.  Again,
this is done such that flags can be used interchangeably between
projects making use of the aarch64-system-regs.def file.  This is done
in the next patch in the series.

`.arch' directives missing from the emitted assembly files as a
consequence of this aliasing are accounted for by the compiler using
the generic s<sn>_<op1>_c<cn>_c<cm>_<op2> encoding of system registers when
issuing mrs/msr instructions.  This design choice ensures the
assembler will accept anything that was deemed acceptable by the
compiler.
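
As an illustrative sketch, with a made-up register name standing in for
one whose extension GCC only knows via a base-architecture alias:

long long
get_ext_reg (void)
{
  /* The name is validated against the .def file; the access is then
     emitted via the register's generic encoding, e.g.
         mrs     x0, s3_0_c13_c2_2
     so the assembler accepts it without any extra .arch directive.  */
  return __arm_rsr64 ("made_up_ext_reg");
}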

gcc/ChangeLog:

* gcc/config/aarch64/aarch64-system-regs.def: New.
---
 gcc/config/aarch64/aarch64-sys-regs.def | 1064 +++
 1 file changed, 1064 insertions(+)
 create mode 100644 gcc/config/aarch64/aarch64-sys-regs.def

diff --git a/gcc/config/aarch64/aarch64-sys-regs.def 
b/gcc/config/aarch64/aarch64-sys-regs.def
new file mode 100644
index 000..d24a2455503
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-sys-regs.def
@@ -0,0 +1,1064 @@
+/* aarch64-system-regs.def -- AArch64 opcode support.
+   Copyright (C) 2009-2023 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is part of the GNU opcodes library.
+
+   This library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   It is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING3.  If not,
+   see <http://www.gnu.org/licenses/>.  */
+
+/* Array of system registers and their associated arch features.
+
+   This file is also used by GCC.  Where necessary, any updates should
+   be made in Binutils and the updated file copied across to GCC, such
+   that the two projects are kept in sync at all times.
+
+   Before using #include to read this file, define a macro:
+
+ SYSREG (name, encoding, flags, features)
+
+  The NAME is the system register name, as recognized by the
+  assembler.  ENCODING provides the necessary information for the binary
+  encoding of the system register.  The FLAGS field is a bitmask of
+  relevant behavior information pertaining to the particular register.
+  For example: is it read/write-only? does it alias another register?
+  The FEATURES field maps onto ISA flags and specifies the architectural
+  feature requirements of the system register.  */
+
+  SYSREG ("accdata_el1",   CPENC (3,0,13,0,5), 0,  
AARCH64_NO_FEATURES)
+  SYSREG ("actlr_el1", CPENC (3,0,1,0,1),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("actlr_el2", CPENC (3,4,1,0,1),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("actlr_el3", CPENC (3,6,1,0,1),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("afsr0_el1", CPENC (3,0,5,1,0),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("afsr0_el12",CPENC (3,5,5,1,0),  F_ARCHEXT,  
AARCH64_FEATURE (V8_1A))
+  SYSREG ("afsr0_el2", CPENC (3,4,5,1,0),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("afsr0_el3", CPENC (3,6,5,1,0),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("afsr1_el1", CPENC (3,0,5,1,1),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("afsr1_el12",CPENC (3,5,5,1,1),  F_ARCHEXT,  
AARCH64_FEATURE (V8_1A))
+  SYSREG ("afsr1_el2", CPENC (3,4,5,1,1),  0,  
AARCH64_NO_FEATURES)
+  SYSREG ("afsr1_el3", CPENC (3,6,5,1,1),  0,  

[PATCH V2 4/7] aarch64: Add basic target_print_operand support for CONST_STRING

2023-10-18 Thread Victor Do Nascimento
Motivated by the need to print system register names in output
assembly, this patch adds the required logic to
`aarch64_print_operand' to accept rtxs of type CONST_STRING and
process these accordingly.

Consequently, an rtx such as:

  (set (reg/i:DI 0 x0)
 (unspec:DI [(const_string ("s3_3_c13_c2_2"))])

can now be output correctly using the following output pattern when
composing `define_insn's:

  "mrs\t%x0, %1"

gcc/ChangeLog

* gcc/config/aarch64/aarch64.cc (aarch64_print_operand): Add
support for CONST_STRING.
---
 gcc/config/aarch64/aarch64.cc | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 816c4b69fc8..d187e171beb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -12430,6 +12430,12 @@ aarch64_print_operand (FILE *f, rtx x, int code)
 
   switch (GET_CODE (x))
{
+   case CONST_STRING:
+ {
+   const char *output_op = XSTR (x, 0);
+   asm_fprintf (f, "%s", output_op);
+   break;
+ }
case REG:
  if (aarch64_sve_data_mode_p (GET_MODE (x)))
{
-- 
2.41.0



[PATCH V2 0/7] aarch64: Add support for __arm_rsr and __arm_wsr ACLE function family

2023-10-18 Thread Victor Do Nascimento
This revision of the patch series addresses the following key pieces
of upstream feedback:

  * `aarch64-sys-regs.def', being identical in content to the file with
  the same name in Binutils, now retains the copyright header from
  Binutils.
  * We migrate away from the binary search handling of system-register
  lookups in favour of a hashmap approach, relaxing the requirement
  that all entries in `aarch64-sys-regs.def' be kept in alphabetical
  order.
  * A static selftest is added for sanity-checking of the contents of
  `aarch64-sys-regs.def'.  Given the move to a hashmap lookup mechanism,
  no testing is needed for the preservation of alphabetical order, but
  a test is added to detect spurious duplicate register definitions.

---

This patch series adds support for reading and writing to and from
system registers via the relevant ACLE-defined builtins [1].

The patch series makes a series of additions to the aarch64-specific
areas of the compiler to make this possible.

Firstly, a mechanism for defining system registers is established via a
new .def file and the new SYSREG macro.  This macro is the same as is
used in Binutils and system register entries are compatible with
either code-base.

Given the information contained in this system register definition
file, a compile-time validation mechanism is implemented, such that any
system register name passed as a string literal argument to these
builtins can be checked against known system registers and its use
for a given target architecture validated.

Finally, patterns for each of these builtins are added to the back-end
such that, if all validation criteria are met, the correct assembly is
emitted.

Thus, the following example of system register access is now valid for
GCC:

long long old = __arm_rsr("trcseqstr");
__arm_wsr("trcseqstr", new);

Testing:
 - Bootstrap/regtest on aarch64-linux-gnu done.

[1] https://arm-software.github.io/acle/main/acle.html

Victor Do Nascimento (7):
  aarch64: Sync system register information with Binutils
  aarch64: Add support for aarch64-sys-regs.def
  aarch64: Implement system register validation tools
  aarch64: Add basic target_print_operand support for CONST_STRING
  aarch64: Implement system register r/w arm ACLE intrinsic functions
  aarch64: Add front-end argument type checking for target builtins
  aarch64: Add system register duplication check selftest

 gcc/config/aarch64/aarch64-builtins.cc|  233 
 gcc/config/aarch64/aarch64-c.cc   |4 +-
 gcc/config/aarch64/aarch64-protos.h   |5 +
 gcc/config/aarch64/aarch64-sys-regs.def   | 1064 +
 gcc/config/aarch64/aarch64.cc |  243 
 gcc/config/aarch64/aarch64.h  |   36 +
 gcc/config/aarch64/aarch64.md |   17 +
 gcc/config/aarch64/arm_acle.h |   30 +
 gcc/config/aarch64/predicates.md  |4 +
 .../gcc.target/aarch64/acle/rwsr-1.c  |   20 +
 .../gcc.target/aarch64/acle/rwsr-2.c  |   15 +
 gcc/testsuite/gcc.target/aarch64/acle/rwsr.c  |  144 +++
 12 files changed, 1813 insertions(+), 2 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-sys-regs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr.c

-- 
2.41.0



[PATCH V2 5/7] aarch64: Implement system register r/w arm ACLE intrinsic functions

2023-10-18 Thread Victor Do Nascimento
Implement the aarch64 intrinsics for reading and writing system
registers with the following signatures:

uint32_t __arm_rsr(const char *special_register);
uint64_t __arm_rsr64(const char *special_register);
void* __arm_rsrp(const char *special_register);
float __arm_rsrf(const char *special_register);
double __arm_rsrf64(const char *special_register);
void __arm_wsr(const char *special_register, uint32_t value);
void __arm_wsr64(const char *special_register, uint64_t value);
void __arm_wsrp(const char *special_register, const void *value);
void __arm_wsrf(const char *special_register, float value);
void __arm_wsrf64(const char *special_register, double value);
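
A brief usage sketch of the read/write pairing; the register name is
reused from the tests elsewhere in this series and the snippet is
purely illustrative:

#include <arm_acle.h>
#include <stdint.h>

void
bump_trace_sequencer (void)
{
  uint64_t v = __arm_rsr64 ("trcseqstr");  /* read the system register */
  __arm_wsr64 ("trcseqstr", v + 1);        /* write the new value back */
}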

gcc/ChangeLog:

* gcc/config/aarch64/aarch64-builtins.cc (enum aarch64_builtins):
Add enums for new builtins.
(aarch64_init_rwsr_builtins): New.
(aarch64_general_init_builtins): Call aarch64_init_rwsr_builtins.
(aarch64_expand_rwsr_builtin):  New.
(aarch64_general_expand_builtin): Call aarch64_general_expand_builtin.
* gcc/config/aarch64/aarch64.md (read_sysregdi): New insn_and_split.
(write_sysregdi): Likewise.
* gcc/config/aarch64/arm_acle.h (__arm_rsr): New.
(__arm_rsrp): Likewise.
(__arm_rsr64): Likewise.
(__arm_rsrf): Likewise.
(__arm_rsrf64): Likewise.
(__arm_wsr): Likewise.
(__arm_wsrp): Likewise.
(__arm_wsr64): Likewise.
(__arm_wsrf): Likewise.
(__arm_wsrf64): Likewise.

gcc/testsuite/ChangeLog:

* gcc/testsuite/gcc.target/aarch64/acle/rwsr.c: New.
* gcc/testsuite/gcc.target/aarch64/acle/rwsr-1.c: Likewise.
---
 gcc/config/aarch64/aarch64-builtins.cc| 200 ++
 gcc/config/aarch64/aarch64.md |  17 ++
 gcc/config/aarch64/arm_acle.h |  30 +++
 .../gcc.target/aarch64/acle/rwsr-1.c  |  20 ++
 gcc/testsuite/gcc.target/aarch64/acle/rwsr.c  | 144 +
 5 files changed, 411 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/rwsr.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 04f59fd9a54..d8bb2a989a5 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -808,6 +808,17 @@ enum aarch64_builtins
   AARCH64_RBIT,
   AARCH64_RBITL,
   AARCH64_RBITLL,
+  /* System register builtins.  */
+  AARCH64_RSR,
+  AARCH64_RSRP,
+  AARCH64_RSR64,
+  AARCH64_RSRF,
+  AARCH64_RSRF64,
+  AARCH64_WSR,
+  AARCH64_WSRP,
+  AARCH64_WSR64,
+  AARCH64_WSRF,
+  AARCH64_WSRF64,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1798,6 +1809,65 @@ aarch64_init_rng_builtins (void)
   AARCH64_BUILTIN_RNG_RNDRRS);
 }
 
+/* Add builtins for reading system register.  */
+static void
+aarch64_init_rwsr_builtins (void)
+{
+  tree fntype = NULL;
+  tree const_char_ptr_type
+= build_pointer_type (build_type_variant (char_type_node, true, false));
+
+#define AARCH64_INIT_RWSR_BUILTINS_DECL(F, N, T) \
+  aarch64_builtin_decls[AARCH64_##F] \
+= aarch64_general_add_builtin ("__builtin_aarch64_"#N, T, AARCH64_##F);
+
+  fntype
+= build_function_type_list (uint32_type_node, const_char_ptr_type, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (RSR, rsr, fntype);
+
+  fntype
+= build_function_type_list (ptr_type_node, const_char_ptr_type, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (RSRP, rsrp, fntype);
+
+  fntype
+= build_function_type_list (uint64_type_node, const_char_ptr_type, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (RSR64, rsr64, fntype);
+
+  fntype
+= build_function_type_list (float_type_node, const_char_ptr_type, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (RSRF, rsrf, fntype);
+
+  fntype
+= build_function_type_list (double_type_node, const_char_ptr_type, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (RSRF64, rsrf64, fntype);
+
+  fntype
+= build_function_type_list (void_type_node, const_char_ptr_type,
+   uint32_type_node, NULL);
+
+  AARCH64_INIT_RWSR_BUILTINS_DECL (WSR, wsr, fntype);
+
+  fntype
+= build_function_type_list (void_type_node, const_char_ptr_type,
+   const_ptr_type_node, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (WSRP, wsrp, fntype);
+
+  fntype
+= build_function_type_list (void_type_node, const_char_ptr_type,
+   uint64_type_node, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (WSR64, wsr64, fntype);
+
+  fntype
+= build_function_type_list (void_type_node, const_char_ptr_type,
+   float_type_node, NULL);
+  AARCH64_INIT_RWSR_BUILTINS_DECL (WSRF, wsrf, fntype);
+
+  fntype
+= build_function_type_list (void_type_node, const_char_ptr_type,
+   double_type_node, NULL);
+  

Re: [V3][PATCH 1/3] Provide counted_by attribute to flexible array member field (PR108896)

2023-10-18 Thread Qing Zhao


>>> +   member FIELD_DECL is a valid field of the containing structure's 
>>> fieldlist,
>>> +   FIELDLIST, Report error and remove this attribute when it's not.  */
>>> +static void
>>> +verify_counted_by_attribute (tree fieldlist, tree field_decl)
>>> +{
>>> +  tree attr_counted_by = lookup_attribute ("counted_by",
>>> +   DECL_ATTRIBUTES (field_decl));
>>> +
>>> +  if (!attr_counted_by)
>>> +return;
>>> +
>>> +  /* If there is an counted_by attribute attached to the field,
>>> + verify it.  */
>>> +
>>> +  const char *fieldname
>>> += IDENTIFIER_POINTER (TREE_VALUE (TREE_VALUE (attr_counted_by)));
>>> +
>>> +  /* Verify the argument of the attrbute is a valid field of the
>> s/attrbute/attribute/
>>> + containing structure.  */
>>> +
>>> +  tree counted_by_field = get_named_field (fieldlist, fieldname);
>>> +
>>> +  /* Error when the field is not found in the containing structure.  */
>>> +  if (!counted_by_field)
>>> +{
>>> +  error_at (DECL_SOURCE_LOCATION (field_decl),
>>> +"%qE attribute argument not a field declaration"
>>> +" in the same structure, ignore it",
>>> +(get_attribute_name (attr_counted_by)));
>> Probably someone with English as a first language would make a better 
>> suggestion, but how about:
>>   Argument specified in %qE attribute is not a field declaration in the
>>   same structure, ignoring it.
>>> +
>>> +  DECL_ATTRIBUTES (field_decl)
>>> += remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
>>> +}
>>> +  else
>>> +  /* Error when the field is not with an integer type.  */
>> Suggest: Flag an error when the field is not of an integer type.
>>> +{
>>> +  while (TREE_CHAIN (counted_by_field))
>>> +counted_by_field = TREE_CHAIN (counted_by_field);
>>> +  tree real_field = TREE_VALUE (counted_by_field);
>>> +
>>> +  if (TREE_CODE (TREE_TYPE (real_field)) != INTEGER_TYPE)
>>> +{
>>> +  error_at (DECL_SOURCE_LOCATION (field_decl),
>>> + "%qE attribute argument not a field declaration"
>>> + " with integer type, ignore it",
>>> + (get_attribute_name (attr_counted_by)));
>> Suggest:
>>   Argument specified in %qE attribute is not of an integer type,
>>   ignoring it.
>>> +
>>> +  DECL_ATTRIBUTES (field_decl)
>>> += remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
>>> +}
>>> +}
>>> +
>>> +  return;
> 
> I forgot to mention the redundant return here.

Could you please clarify a little bit here, why the return here is redundant? 
> 
>>> +}
>>>   /* Fill in the fields of a RECORD_TYPE or UNION_TYPE node, T.



Re: [PATCH v2] swap: Fix incorrect lane extraction by vec_extract() [PR106770]

2023-10-18 Thread David Edelsohn
[Resending from correct email.]

Hi, Surya

Thanks for working on this issue and creating a patch.

It helps if you explicitly send patches to Segher and me, and copy
gcc-patches.

+/* Return true if insn is a non-permuting load/store.  */
+static bool
+non_permuting_mem_insn (swap_web_entry *insn_entry, unsigned int i)
+{
+  return (insn_entry[i].special_handling == SH_NOSWAP_LD ||
+  insn_entry[i].special_handling == SH_NOSWAP_ST);
+}

The logical operator || should be at the beginning of the line, not
the end.  That also ensures correct formatting and indentation.  The
|| should be under the "i" of insn on the line above.

It also generally simplifies review to have the ChangeLog in the same
order as changes in the file.  The order of the files relative to the
patch may not be the same, but the ChangeLog entries should be in
sequential order relative to the file.

This patch is okay with the logical operator formatting change.

Thanks, David


Re: [V3][PATCH 1/3] Provide counted_by attribute to flexible array member field (PR108896)

2023-10-18 Thread Qing Zhao
Hi, Sid, 

Thanks a lot for your time and effort to review this patch set!
And sorry for my late reply due to a long vacation immediately after Cauldron; 
I just came back this Monday. 

See my reply embedded below:

> On Oct 5, 2023, at 2:51 PM, Siddhesh Poyarekar  wrote:
> 
> On 2023-08-25 11:24, Qing Zhao wrote:
>> Provide a new counted_by attribute to flexible array member field.
> 
> The obligatory "I can't ack the patch but here's a review" disclaimer :)
> 
>> 'counted_by (COUNT)'
>>  The 'counted_by' attribute may be attached to the flexible array
>>  member of a structure.  It indicates that the number of the
>>  elements of the array is given by the field named "COUNT" in the
>>  same structure as the flexible array member.  GCC uses this
>>  information to improve the results of the array bound sanitizer and
>>  the '__builtin_dynamic_object_size'.
>>  For instance, the following code:
>>   struct P {
>> size_t count;
>> char other;
>> char array[] __attribute__ ((counted_by (count)));
>>   } *p;
>>  specifies that the 'array' is a flexible array member whose number
>>  of elements is given by the field 'count' in the same structure.
>>  The field that represents the number of the elements should have an
>>  integer type.  An explicit 'counted_by' annotation defines a
>>  relationship between two objects, 'p->array' and 'p->count', that
>>  'p->array' has _at least_ 'p->count' number of elements available.
>>  This relationship must hold even after any of these related objects
>>  are updated.  It's the user's responsibility to make sure this
>>  relationship to be kept all the time.  Otherwise the results of the
>>  array bound sanitizer and the '__builtin_dynamic_object_size' might
>>  be incorrect.
>>  For instance, in the following example, the allocated array has
>>  less elements than what's specified by the 'sbuf->count', this is
>>  an user error.  As a result, out-of-bounds access to the array
>>  might not be detected.
>>   #define SIZE_BUMP 10
>>   struct P *sbuf;
>>   void alloc_buf (size_t nelems)
>>   {
>> sbuf = (struct P *) malloc (MAX (sizeof (struct P),
>>(offsetof (struct P, array[0])
>> + nelems * sizeof (char))));
>> sbuf->count = nelems + SIZE_BUMP;
>> /* This is invalid when the sbuf->array has less than sbuf->count
>>elements.  */
>>   }
>>  In the following example, the 2nd update to the field 'sbuf->count'
>>  of the above structure will permit out-of-bounds access to the
>>  array 'sbuf->array' as well.
>>   #define SIZE_BUMP 10
>>   struct P *sbuf;
>>   void alloc_buf (size_t nelems)
>>   {
>> sbuf = (struct P *) malloc (MAX (sizeof (struct P),
>>(offsetof (struct P, array[0])
>> + (nelems + SIZE_BUMP) * sizeof 
>> (char))));
>> sbuf->count = nelems;
>> /* This is valid when the sbuf->array has at least sbuf->count
>>elements.  */
>>   }
>>   void use_buf (int index)
>>   {
>> sbuf->count = sbuf->count + SIZE_BUMP + 1;
>> /* Now the value of sbuf->count is larger than the number
>>of elements of sbuf->array.  */
>> sbuf->array[index] = 0;
>> /* then the out-of-bound access to this array
>>might not be detected.  */
>>   }
>> gcc/c-family/ChangeLog:
>>  PR C/108896
>>  * c-attribs.cc (handle_counted_by_attribute): New function.
>>  (attribute_takes_identifier_p): Add counted_by attribute to the list.
>>  * c-common.cc (c_flexible_array_member_type_p): ...To this.
>>  * c-common.h (c_flexible_array_member_type_p): New prototype.
>> gcc/c/ChangeLog:
>>  PR C/108896
>>  * c-decl.cc (flexible_array_member_type_p): Renamed and moved to...
>>  (add_flexible_array_elts_to_size): Use renamed function.
>>  (is_flexible_array_member_p): Use renamed function.
>>  (verify_counted_by_attribute): New function.
>>  (finish_struct): Use renamed function and verify counted_by
>>  attribute.
>> gcc/ChangeLog:
>>  PR C/108896
>>  * doc/extend.texi: Document attribute counted_by.
>>  * tree.cc (get_named_field): New function.
>>  * tree.h (get_named_field): New prototype.
>> gcc/testsuite/ChangeLog:
>>  PR C/108896
>>  * gcc.dg/flex-array-counted-by.c: New test.
>> ---
>>  gcc/c-family/c-attribs.cc| 54 -
>>  gcc/c-family/c-common.cc | 13 
>>  gcc/c-family/c-common.h  |  1 +
>>  gcc/c/c-decl.cc  | 79 

[PATCH6/8] omp: Reorder call for TARGET_SIMD_CLONE_ADJUST (was Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM)

2023-10-18 Thread Andre Vieira (lists)
This patch moves the call to TARGET_SIMD_CLONE_ADJUST until after the 
arguments and return types have been transformed into vector types.  It 
also constructs the adjuments and retval modifications after this call, 
allowing targets to alter the types of the arguments and return of the 
clone prior to the modifications to the function definition.


Is this OK?

gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_adjust_return_type): Hoist out
code to create return array and don't return new type.
(simd_clone_adjust_argument_types): Hoist out code that creates
ipa_param_body_adjustments and don't return them.
(simd_clone_adjust): Call TARGET_SIMD_CLONE_ADJUST after return
and argument types have been vectorized, create adjustments and
return array after the hook.
(expand_simd_clones): Call TARGET_SIMD_CLONE_ADJUST after return
and argument types have been vectorized.

On 04/10/2023 13:40, Andre Vieira (lists) wrote:



On 04/10/2023 11:41, Richard Biener wrote:

On Wed, 4 Oct 2023, Andre Vieira (lists) wrote:




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:

This patch adds a new target hook to enable us to adapt the types 
of return
and parameters of simd clones.  We use this in two ways, the first 
one is

to
make sure we can create valid SVE types, including the SVE type 
attribute,
when creating a SVE simd clone, even when the target options do not 
support
SVE.  We are following the same behaviour seen with x86 that 
creates simd
clones according to the ABI rules when no simdlen is provided, even 
if that
simdlen is not supported by the current target options.  Note that 
this

doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the 
time we
call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not 
vector.  The

same is true for the return type one.

Also the changes to the types need to be taken into consideration in
'adjustments' I think.


Nothing in the three existing implementations of TARGET_SIMD_CLONE_ADJUST
relies on this ordering I think, how about moving the hook invocation
after simd_clone_adjust_argument_types?



But that wouldn't change the 'ipa_param_body_adjustments' for when we 
have a function definition and we need to redo the body.

Richard.

PS: I hope the subject line survived, my email client is having a bit 
of a

wobble this morning... it's what you get for updating software :(

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
ef0b9b48c7212900023bc0eaebca5e1f9389db77..fb80888190c88e29895ecfbbe1b17d390c9a9dfe
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -701,10 +701,9 @@ simd_clone_create (struct cgraph_node *old_node, bool 
force_local)
 }
 
 /* Adjust the return type of the given function to its appropriate
-   vector counterpart.  Returns a simd array to be used throughout the
-   function as a return value.  */
+   vector counterpart.  */
 
-static tree
+static void
 simd_clone_adjust_return_type (struct cgraph_node *node)
 {
   tree fndecl = node->decl;
@@ -714,7 +713,7 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
 
   /* Adjust the function return type.  */
   if (orig_rettype == void_type_node)
-return NULL_TREE;
+return;
   t = TREE_TYPE (TREE_TYPE (fndecl));
   if (INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
 veclen = node->simdclone->vecsize_int;
@@ -737,24 +736,6 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
veclen));
 }
   TREE_TYPE (TREE_TYPE (fndecl)) = t;
-  if (!node->definition)
-return NULL_TREE;
-
-  t = DECL_RESULT (fndecl);
-  /* Adjust the DECL_RESULT.  */
-  gcc_assert (TREE_TYPE (t) != void_type_node);
-  TREE_TYPE (t) = TREE_TYPE (TREE_TYPE (fndecl));
-  relayout_decl (t);
-
-  tree atype = build_array_type_nelts (orig_rettype,
-  node->simdclone->simdlen);
-  if (maybe_ne (veclen, node->simdclone->simdlen))
-return build1 (VIEW_CONVERT_EXPR, atype, t);
-
-  /* Set up a SIMD array to use as the return value.  */
-  tree retval = create_tmp_var_raw (atype, "retval");
-  gimple_add_tmp_var (retval);
-  return retval;
 }
 
 /* Each vector argument has a corresponding array to be used locally
@@ -788,7 +769,7 @@ create_tmp_simd_array (const char *prefix, tree type, 
poly_uint64 simdlen)
declarations will be remapped.  New arguments which are not to be remapped
are marked with USER_FLAG.  */
 
-static ipa_param_body_adjustments *
+static void
 simd_clone_adjust_argument_types (struct cgraph_node *node)
 {
   auto_vec args;
@@ -798,15 +779,11 @@ simd_clone_adjust_argument_types (struct cgraph_node 
*node)
   else
 

Re: [PATCH 8/8] aarch64: Add SVE support for simd clones [PR 96342]

2023-10-18 Thread Andre Vieira (lists)

Rebased, no major changes, still needs review.

On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:
This patch finalizes adding support for the generation of SVE simd 
clones when no simdlen is provided, following the ABI rules where the 
widest data type determines the minimum number of elements in a 
length-agnostic vector.

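As an illustrative sketch of that rule (the declaration is hypothetical):

#pragma omp declare simd
double f (double d, float s);

/* The widest element type above is 64 bits, so each 128-bit SVE
   granule holds at least two elements; with no simdlen clause a
   length-agnostic (VLA) clone is generated, and its mangled name uses
   "x" in place of a constant simdlen, per simd_clone_mangle below.  */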

gcc/ChangeLog:

     * config/aarch64/aarch64-protos.h (add_sve_type_attribute): 
Declare.
 * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): 
Make

 visibility global.
 * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
 chosen over SIMD ABI if a SVE type is used in return or arguments.
 (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd 
clone

 when no simdlen is provided, according to ABI rules.
 (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
 (aarch64_simd_clone_adjust_ret_or_param): New.
 (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
 * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
 (simd_clone_adjust): Adapt safelen check to be compatible with VLA
 simdlen.

gcc/testsuite/ChangeLog:

 * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
 * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
 * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
 longer necessary.
 * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1005,6 +1005,8 @@ namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+ const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 
161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c
 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
 mangling of the type.  ACLE_NAME is the <arm_sve.h> name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
 = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+= (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree 
fntype)
 static const predefined_function_abi &
 aarch64_fntype_abi (const_tree fntype)
 {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-return aarch64_simd_abi ();
-
   if (aarch64_returns_value_in_sve_regs_p (fntype)
   || aarch64_takes_arguments_in_sve_regs_p (fntype))
 return aarch64_sve_abi ();
 
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
+return aarch64_simd_abi ();
+
   return default_function_abi;
 }
 
@@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   int count;
   unsigned HOST_WIDE_INT const_simdlen;
 
@@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
 {
   nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+  wds_elt_bits = nds_elt_bits;
   vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
 }
   else
-nds_elt_bits = POINTER_SIZE;
+{
+  nds_elt_bits = POINTER_SIZE;
+  wds_elt_bits = 0;
+}
 
   int i;
   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE 

Re: [PATCH 4/8] vect: don't allow fully masked loops with non-masked simd clones [PR 110485]

2023-10-18 Thread Andre Vieira (lists)
Rebased on top of trunk; minor change to check for loop_vinfo since we 
now do some SLP vectorization for simd_clones.


I assume the previous OK still holds.

On 30/08/2023 13:54, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


When analyzing a loop and choosing a simdclone to use it is possible to choose
a simdclone that cannot be used 'inbranch' for a loop that can use partial
vectors.  This may lead to the vectorizer deciding to use partial vectors
which are not supported for notinbranch simd clones. This patch fixes that by
disabling the use of partial vectors once a notinbranch simd clone has been
selected.


OK.


gcc/ChangeLog:

PR tree-optimization/110485
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Disable partial
vectors usage if a notinbranch simdclone has been selected.

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr110485.c: New test.

diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c 
b/gcc/testsuite/gcc.dg/gomp/pr110485.c
new file mode 100644
index 
..ba6817a127f40246071e32ccebf692cc4d121d15
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
@@ -0,0 +1,19 @@
+/* PR 110485 */
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -fdump-tree-vect-details" } */
+/* { dg-additional-options "-march=znver4 --param=vect-partial-vector-usage=1" 
{ target x86_64-*-* } } */
+#pragma omp declare simd notinbranch uniform(p)
+extern double __attribute__ ((const)) bar (double a, double p);
+
+double a[1024];
+double b[1024];
+
+void foo (int n)
+{
+  #pragma omp simd
+  for (int i = 0; i < n; ++i)
+a[i] = bar (b[i], 71.2);
+}
+
+/* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a 
non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
a9156975d64c7a335ffd27614e87f9d11b23d1ba..731acc76350cae39c899a866584068cff247183a
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4539,6 +4539,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   ? boolean_true_node : boolean_false_node;
simd_clone_info.safe_push (sll);
  }
+
+  if (!bestn->simdclone->inbranch && loop_vinfo)
+   {
+ if (dump_enabled_p ()
+ && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+   dump_printf_loc (MSG_NOTE, vect_location,
+"can't use a fully-masked loop because a"
+" non-masked simd clone was selected.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+   }
+
   STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
 /*  vect_model_simple_cost (vinfo, stmt_info, ncopies,


[PATCH 0/8] omp: Replace simd_clone_subparts with TYPE_VECTOR_SUBPARTS

2023-10-18 Thread Andre Vieira (lists)


Refactor simd clone handling code ahead of support for poly simdlen.

gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_subparts): Remove.
(simd_clone_init_simd_arrays): Replace simd_clone_supbarts with
TYPE_VECTOR_SUBPARTS.
(ipa_simd_modify_function_body): Likewise.
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Likewise.
(simd_clone_subparts): Remove.
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
c1cb7cc8a5c770940bc2032f824e084b37e96dbe..a42643400ddcf10961633448b49d4caafb999f12
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -255,16 +255,6 @@ ok_for_auto_simd_clone (struct cgraph_node *node)
   return true;
 }
 
-
-/* Return the number of elements in vector type VECTYPE, which is associated
-   with a SIMD clone.  At present these always have a constant length.  */
-
-static unsigned HOST_WIDE_INT
-simd_clone_subparts (tree vectype)
-{
-  return TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
-}
-
 /* Allocate a fresh `simd_clone' and return it.  NARGS is the number
of arguments to reserve space for.  */
 
@@ -1028,7 +1018,7 @@ simd_clone_init_simd_arrays (struct cgraph_node *node,
}
  continue;
}
-  if (known_eq (simd_clone_subparts (TREE_TYPE (arg)),
+  if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg)),
node->simdclone->simdlen))
{
  tree ptype = build_pointer_type (TREE_TYPE (TREE_TYPE (array)));
@@ -1040,7 +1030,7 @@ simd_clone_init_simd_arrays (struct cgraph_node *node,
}
   else
{
- unsigned int simdlen = simd_clone_subparts (TREE_TYPE (arg));
+ poly_uint64 simdlen = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg));
  unsigned int times = vector_unroll_factor (node->simdclone->simdlen,
 simdlen);
  tree ptype = build_pointer_type (TREE_TYPE (TREE_TYPE (array)));
@@ -1226,9 +1216,9 @@ ipa_simd_modify_function_body (struct cgraph_node *node,
  iter, NULL_TREE, NULL_TREE);
   adjustments->register_replacement (&(*adjustments->m_adj_params)[j], r);
 
-  if (multiple_p (node->simdclone->simdlen, simd_clone_subparts (vectype)))
+  if (multiple_p (node->simdclone->simdlen, TYPE_VECTOR_SUBPARTS (vectype)))
 	j += vector_unroll_factor (node->simdclone->simdlen,
-				   simd_clone_subparts (vectype)) - 1;
+				   TYPE_VECTOR_SUBPARTS (vectype)) - 1;
 }
   adjustments->sort_replacements ();
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
9bb43e98f56d18929c9c02227954fdf38eafefd8..a9156975d64c7a335ffd27614e87f9d11b23d1ba
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4126,16 +4126,6 @@ vect_simd_lane_linear (tree op, class loop *loop,
 }
 }
 
-/* Return the number of elements in vector type VECTYPE, which is associated
-   with a SIMD clone.  At present these vectors always have a constant
-   length.  */
-
-static unsigned HOST_WIDE_INT
-simd_clone_subparts (tree vectype)
-{
-  return TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
-}
-
 /* Function vectorizable_simd_clone_call.
 
Check if STMT_INFO performs a function call that can be vectorized
@@ -4429,7 +4419,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
slp_node);
  if (arginfo[i].vectype == NULL
  || !constant_multiple_p (bestn->simdclone->simdlen,
-				       simd_clone_subparts (arginfo[i].vectype)))
+				       TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
return false;
}
 
@@ -4444,10 +4434,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 
   if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
{
+ tree clone_arg_vectype = bestn->simdclone->args[i].vector_type;
  if (bestn->simdclone->mask_mode == VOIDmode)
{
- if (simd_clone_subparts (bestn->simdclone->args[i].vector_type)
- != simd_clone_subparts (arginfo[i].vectype))
+ if (maybe_ne (TYPE_VECTOR_SUBPARTS (clone_arg_vectype),
+   TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
{
  /* FORNOW we only have partial support for vector-type masks
 that can't hold all of simdlen. */
@@ -4464,7 +4455,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
  if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
  || maybe_ne (exact_div (bestn->simdclone->simdlen,
  num_mask_args),
-  simd_clone_subparts (arginfo[i].vectype)))
+  TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))

Re: [PATCH 5/8] vect: Use inbranch simdclones in masked loops

2023-10-18 Thread Andre Vieira (lists)

Rebased, needs review.

On 30/08/2023 10:13, Andre Vieira (lists) via Gcc-patches wrote:
This patch enables the compiler to use inbranch simdclones when 
generating masked loops in autovectorization.
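
As a sketch of what this enables (example mine, not from the patch): an
unconditional call in a loop vectorized with partial vectors can now use
the inbranch clone, with the loop mask passed as the clone's mask
argument.

/* Illustrative only: with fully-masked loops, the call to f can be
   vectorized via f's inbranch clone instead of blocking masking.  */
#pragma omp declare simd inbranch
int f (int x);

void g (int *restrict a, int *restrict b, int n)
{
  #pragma omp simd
  for (int i = 0; i < n; i++)
    a[i] = f (b[i]);
}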


gcc/ChangeLog:

 * omp-simd-clone.cc (simd_clone_adjust_argument_types): Make function
 compatible with mask parameters in clone.
 * tree-vect-stmts.cc (vect_convert): New helper function.
 (vect_build_all_ones_mask): Allow vector boolean typed masks.
 (vectorizable_simd_clone_call): Enable the use of masked clones in
 fully masked loops.

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
a42643400ddcf10961633448b49d4caafb999f12..ef0b9b48c7212900023bc0eaebca5e1f9389db77
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -807,8 +807,14 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
 {
   ipa_adjusted_param adj;
   memset (&adj, 0, sizeof (adj));
-  tree parm = args[i];
-  tree parm_type = node->definition ? TREE_TYPE (parm) : parm;
+  tree parm = NULL_TREE;
+  tree parm_type = NULL_TREE;
+  if(i < args.length())
+   {
+ parm = args[i];
+ parm_type = node->definition ? TREE_TYPE (parm) : parm;
+   }
+
   adj.base_index = i;
   adj.prev_clone_index = i;
 
@@ -1547,7 +1553,7 @@ simd_clone_adjust (struct cgraph_node *node)
  mask = gimple_assign_lhs (g);
  g = gimple_build_assign (make_ssa_name (TREE_TYPE (mask)),
   BIT_AND_EXPR, mask,
-  build_int_cst (TREE_TYPE (mask), 1));
+  build_one_cst (TREE_TYPE (mask)));
  gsi_insert_after (&gsi, g, GSI_CONTINUE_LINKING);
  mask = gimple_assign_lhs (g);
}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
731acc76350cae39c899a866584068cff247183a..6e2c70c1d3970af652c1e50e41b144162884bf24
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1594,6 +1594,20 @@ check_load_store_for_partial_vectors (loop_vec_info 
loop_vinfo, tree vectype,
 }
 }
 
+/* Return SSA name of the result of the conversion of OPERAND into type TYPE.
+   The conversion statement is inserted at GSI.  */
+
+static tree
+vect_convert (vec_info *vinfo, stmt_vec_info stmt_info, tree type, tree 
operand,
+ gimple_stmt_iterator *gsi)
+{
+  operand = build1 (VIEW_CONVERT_EXPR, type, operand);
+  gassign *new_stmt = gimple_build_assign (make_ssa_name (type),
+  operand);
+  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+  return gimple_get_lhs (new_stmt);
+}
+
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
that needs to be applied to all loads and stores in a vectorized loop.
@@ -2547,7 +2561,8 @@ vect_build_all_ones_mask (vec_info *vinfo,
 {
   if (TREE_CODE (masktype) == INTEGER_TYPE)
 return build_int_cst (masktype, -1);
-  else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+  else if (VECTOR_BOOLEAN_TYPE_P (masktype)
+  || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
 {
   tree mask = build_int_cst (TREE_TYPE (masktype), -1);
   mask = build_vector_from_val (masktype, mask);
@@ -4156,7 +4171,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   size_t i, nargs;
   tree lhs, rtype, ratype;
   vec *ret_ctor_elts = NULL;
-  int arg_offset = 0;
+  int masked_call_offset = 0;
 
   /* Is STMT a vectorizable call?   */
   gcall *stmt = dyn_cast  (stmt_info->stmt);
@@ -4171,7 +4186,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
   fndecl = TREE_OPERAND (fndecl, 0);
   gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
-  arg_offset = 1;
+  masked_call_offset = 1;
 }
   if (fndecl == NULL_TREE)
 return false;
@@ -4199,7 +4214,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 return false;
 
   /* Process function arguments.  */
-  nargs = gimple_call_num_args (stmt) - arg_offset;
+  nargs = gimple_call_num_args (stmt) - masked_call_offset;
 
   /* Bail out if the function has zero arguments.  */
   if (nargs == 0)
@@ -4221,7 +4236,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   thisarginfo.op = NULL_TREE;
   thisarginfo.simd_lane_linear = false;
 
-  int op_no = i + arg_offset;
+  int op_no = i + masked_call_offset;
   if (slp_node)
op_no = vect_slp_child_index_for_operand (stmt, op_no);
   if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
@@ -4303,16 +4318,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   arginfo.quick_push (thisarginfo);
 }
 
-  if (loop_vinfo
-  && !LOOP_VINFO_VECT_FACTOR 

Re: [Patch 3/8] vect: Fix vect_get_smallest_scalar_type for simd clones

2023-10-18 Thread Andre Vieira (lists)

Made it a local function and changed prototype according to comments.

Is this OK?

 gcc/ChangeLog:

* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Special case
simd clone calls and only use types that are mapped to vectors.
(simd_clone_call_p): New helper function.
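
For context (reconstruction mine, not a testcase from the patch): a
conditional call like the one below becomes an IFN_MASK_CALL whose first
operand is the address of f, and that pointer type could be picked up as
a candidate "smallest scalar type", as described in the quoted thread
below.

/* Hypothetical sketch: before the fix, .MASK_CALL (f, ...)'s function
   pointer operand could skew the vectorization factor chosen for the
   short elements.  */
#pragma omp declare simd inbranch
short f (short x);

void g (short *restrict a, short *restrict b, int n)
{
  #pragma omp simd
  for (int i = 0; i < n; i++)
    if (b[i] > 0)
      a[i] = f (b[i]);
}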

On 30/08/2023 13:54, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


The vect_get_smallest_scalar_type helper function was using any argument to a
simd clone call when trying to determine the smallest scalar type that would
be vectorized.  This included the function pointer type in a MASK_CALL for
instance, and would result in the wrong type being selected.  Instead this
patch special cases simd_clone_call's and uses only scalar types of the
original function that get transformed into vector types.


Looks sensible.

+bool
+simd_clone_call_p (gimple *stmt, cgraph_node **out_node)

you could return the cgraph_node * or NULL here.  Are you going to
use the function elsewhere?  Otherwise put it in the same TU as
the only use please and avoid exporting it.

Richard.


gcc/ChangeLog:

* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Special case
simd clone calls and only use types that are mapped to vectors.
* tree-vect-stmts.cc (simd_clone_call_p): New helper function.
* tree-vectorizer.h (simd_clone_call_p): Declare new function.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-16f.c: Remove unnecessary differentiation
between targets with different pointer sizes.
* gcc.dg/vect/vect-simd-clone-17f.c: Likewise.
* gcc.dg/vect/vect-simd-clone-18f.c: Likewise.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
index 
574698d3e133ecb8700e698fa42a6b05dd6b8a18..7cd29e894d0502a59fadfe67db2db383133022d3
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-16.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
index 
8bb6d19301a67a3eebce522daaf7d54d88f708d7..177521dc44531479fca1f1a1a0f2010f30fa3fb5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-17.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
index 
d34f23f4db8e9c237558cc22fe66b7e02b9e6c20..4dd51381d73c0c7c8ec812f24e5054df038059c5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-18.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 
40ab568fe355964b878d770010aa9eeaef63eeac..106d46e68910df94e806433e1cd841894a86d062
 100644
--- a/gcc/tree-vect-data-refs.cc
+++ 

Re: [Patch 2/8] parloops: Allow poly nit and bound

2023-10-18 Thread Andre Vieira (lists)

Posting the changed patch for completion, already reviewed.

On 30/08/2023 13:32, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


Teach parloops how to handle a poly nit and bound ahead of the changes to
enable non-constant simdlen.


Can you use poly_int_tree_p to combine INTEGER_CST || POLY_INT_CST please?

OK with that change.


gcc/ChangeLog:

* tree-parloops.cc (try_to_transform_to_exit_first_loop_alt): Accept
poly NIT and ALT_BOUND.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
a35f3d5023b06e5ef96eb4222488fcb34dd7bd45..80f3dd6dce281e1eb1d76d38bd09e6638a875142
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2531,14 +2531,15 @@ try_transform_to_exit_first_loop_alt (class loop *loop,
   tree nit_type = TREE_TYPE (nit);
 
   /* Figure out whether nit + 1 overflows.  */
-  if (TREE_CODE (nit) == INTEGER_CST)
+  if (poly_int_tree_p (nit))
 {
   if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
{
  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
   nit, build_one_cst (nit_type));
 
- gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+ gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST
+ || TREE_CODE (alt_bound) == POLY_INT_CST);
  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
  return true;
}


Re: [PATCH 1/8] parloops: Copy target and optimizations when creating a function clone

2023-10-18 Thread Andre Vieira (lists)

Just posting a rebase for completion.

On 30/08/2023 13:31, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:



SVE simd clones need to be compiled with an SVE target enabled or the
argument types will not be created properly. To achieve this we need to copy
DECL_FUNCTION_SPECIFIC_TARGET from the original function declaration to the
clones.  I decided it was probably also a good idea to copy
DECL_FUNCTION_SPECIFIC_OPTIMIZATION in case the original function is meant to
be compiled with specific optimization options.


OK.


gcc/ChangeLog:

* tree-parloops.cc (create_loop_fn): Copy specific target and
optimization options to clone.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
e495bbd65270bdf90bae2c4a2b52777522352a77..a35f3d5023b06e5ef96eb4222488fcb34dd7bd45
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2203,6 +2203,11 @@ create_loop_fn (location_t loc)
   DECL_CONTEXT (t) = decl;
   TREE_USED (t) = 1;
   DECL_ARGUMENTS (decl) = t;
+  DECL_FUNCTION_SPECIFIC_TARGET (decl)
+= DECL_FUNCTION_SPECIFIC_TARGET (act_cfun->decl);
+  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (decl)
+= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (act_cfun->decl);
+
 
   allocate_struct_function (decl, false);
 


Re: aarch64, vect, omp: Add SVE support for simd clones [PR 96342]

2023-10-18 Thread Andre Vieira (lists)

Hi,

I noticed I had missed one of the preparatory patches at the start of 
this series (the first one); it is added now. I also removed 'vect: Add 
vector_mode parameter to simd_clone_usable' since after review we no 
longer deemed it necessary, and replaced the old 'vect: Add 
TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM' with 'omp: Reorder call for 
TARGET_SIMD_CLONE_ADJUST' after comments.


Bootstrapped and regression tested the series on 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu.



Andre Vieira (8):

omp: Replace simd_clone_subparts with TYPE_VECTOR_SUBPARTS [NEW]
parloops: Copy target and optimizations when creating a function clone 
[Reviewed]
parloops: Allow poly nit and bound [Cond Reviewed, made the requested 
changes]
vect: Fix vect_get_smallest_scalar_type for simd clones [First Reviewed, 
made the requested changes, OK?]
vect: don't allow fully masked loops with non-masked simd clones [PR 
110485] [Reviewed]

vect: Use inbranch simdclones in masked loops [Needs review]
vect: omp: Reorder call for TARGET_SIMD_CLONE_ADJUST [NEW]
aarch64: Add SVE support for simd clones [PR 96342] [Needs review]

PS: apologies for the inconsistent numbering of the emails, things got a 
bit confusing with removing and adding patches to the series.


On 30/08/2023 09:49, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

This patch series aims to implement support for SVE simd clones when not 
specifying a 'simdlen' clause for AArch64. This patch depends on my 
earlier patch: '[PATCH] aarch64: enable mixed-types for aarch64 
simdclones'.


Bootstrapped and regression tested the series on 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu. I also tried building 
the patches separately, but that was before some further clean-up 
restructuring, so will do that again prior to pushing.


Andre Vieira (8):

parloops: Copy target and optimizations when creating a function clone
parloops: Allow poly nit and bound
vect: Fix vect_get_smallest_scalar_type for simd clones
vect: don't allow fully masked loops with non-masked simd clones [PR 
110485]

vect: Use inbranch simdclones in masked loops
vect: Add vector_mode paramater to simd_clone_usable
vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
aarch64: Add SVE support for simd clones [PR 96342]


RE: [x86 PATCH] PR target/110551: Fix reg allocation for widening multiplications.

2023-10-18 Thread Roger Sayle


Many thanks to Tobias Burnus for pointing out the mistake/typo in the PR
number.  This fix is for PR 110551, not PR 110511.  I'll update the
ChangeLog and filename of the new testcase, if approved.

Sorry for any inconvenience/confusion.
Cheers,
Roger
--

> -Original Message-
> From: Roger Sayle 
> Sent: 17 October 2023 20:06
> To: 'gcc-patches@gcc.gnu.org' 
> Cc: 'Uros Bizjak' 
> Subject: [x86 PATCH] PR target/110511: Fix reg allocation for widening
> multiplications.
> 
> 
> This patch contains clean-ups of the widening multiplication patterns in
i386.md,
> and provides variants of the existing highpart multiplication
> peephole2 transformations (that tidy up register allocation after reload),
and
> thereby fixes PR target/110511, which is a superfluous move instruction.
> 
> For the new test case, compiled on x86_64 with -O2.
> 
> Before:
> mulx64: movabsq $-7046029254386353131, %rcx
> movq%rcx, %rax
> mulq%rdi
> xorq%rdx, %rax
> ret
> 
> After:
> mulx64: movabsq $-7046029254386353131, %rax
> mulq%rdi
> xorq%rdx, %rax
> ret
> 
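(The testcase source isn't quoted in this message; the function below is
my reconstruction from the assembly above -- the constant
-7046029254386353131 is 0x9E3779B97F4A7C15 -- and not code taken from
the patch.)

/* Assumed shape of the mulx64 testcase: a 64x64->128 bit widening
   multiply whose high and low halves are xor'ed together.  */
unsigned long long
mulx64 (unsigned long long x)
{
  __uint128_t r = (__uint128_t) x * 0x9E3779B97F4A7C15ull;
  return (unsigned long long) r ^ (unsigned long long) (r >> 64);
}
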
> The clean-ups are (i) that operand 1 is consistently made register_operand
and
> operand 2 becomes nonimmediate_operand, so that predicates match the
> constraints, (ii) the representation of the BMI2 mulx instruction is
updated to use
> the new umul_highpart RTX, and (iii) because operands
> 0 and 1 have different modes in widening multiplications, "a" is a more
> appropriate constraint than "0" (which avoids spills/reloads containing
SUBREGs).
> The new peephole2 transformations are based upon those at around line 9951
of
> i386.md, that begins with the comment ;; Highpart multiplication
peephole2s to
> tweak register allocation.
> ;; mov imm,%rdx; mov %rdi,%rax; imulq %rdx  ->  mov imm,%rax; imulq %rdi
> 
> 
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and
> make -k check, both with and without --target_board=unix{-m32} with no new
> failures.  Ok for mainline?
> 
> 
> 2023-10-17  Roger Sayle  
> 
> gcc/ChangeLog
> PR target/110511
> * config/i386/i386.md (<u>mul<mode><dwi>3): Make operands 1 and
> 2 take "register_operand" and "nonimmediate_operand" respectively.
> (<u>mulqihi3): Likewise.
> (*bmi2_umul<mode><dwi>3_1): Operand 2 needs to be register_operand
> matching the %d constraint.  Use umul_highpart RTX to represent
> the highpart multiplication.
> (*umul<mode><dwi>3_1): Operand 2 should use register_operand
> predicate, and "a" rather than "0" as operands 0 and 2 have
> different modes.
> (define_split): For mul to mulx conversion, use the new
> umul_highpart RTX representation.
> (*mul<mode><dwi>3_1): Operand 1 should be register_operand
> and the constraint %a as operands 0 and 1 have different modes.
> (*<u>mulqihi3_1): Operand 1 should be register_operand matching
> the constraint %0.
> (define_peephole2): Providing widening multiplication variants
> of the peephole2s that tweak highpart multiplication register
> allocation.
> 
> gcc/testsuite/ChangeLog
> PR target/110511
> * gcc.target/i386/pr110511.c: New test case.
> 
> 
> Thanks in advance,
> Roger




[PATCH] vect: Allow same precision for bit-precision conversions.

2023-10-18 Thread Robin Dapp
Hi,

even though there was no full conclusion yet I took the liberty of
just posting this as a patch in case of further discussion.

In PR/111794 we miss a vectorization because on riscv type precision and
mode precision differ for mask types.  We can still vectorize when
allowing assignments with the same precision for dest and source which
is what this patch does.

Bootstrapped and regtested on x86, aarch64 and power10.  No new failures
on riscv.

Regards
 Robin

gcc/ChangeLog:

PR/111794

* tree-vect-stmts.cc (vectorizable_assignment): Add
same-precision exception for dest and source.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/slp-mask-1.c: New test.
* gcc.target/riscv/rvv/autovec/slp-mask-run-1.c: New test.
---
 .../gcc.target/riscv/rvv/autovec/slp-mask-1.c | 18 +++
 .../riscv/rvv/autovec/slp-mask-run-1.c| 31 +++
 gcc/tree-vect-stmts.cc| 12 ---
 3 files changed, 56 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c
new file mode 100644
index 000..ee1baa58d63
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=gnu99 -O3 -march=rv64gcv -mabi=lp64d 
--param=riscv-autovec-preference=scalable -fdump-tree-slp-details" } */
+
+void
+__attribute__ ((noipa))
+f (int *restrict x, short *restrict y, int *restrict res)
+{
+  res[0] = x[0] == 1 & y[0] == 2;
+  res[1] = x[1] == 1 & y[1] == 2;
+  res[2] = x[2] == 1 & y[2] == 2;
+  res[3] = x[3] == 1 & y[3] == 2;
+  res[4] = x[4] == 1 & y[4] == 2;
+  res[5] = x[5] == 1 & y[5] == 2;
+  res[6] = x[6] == 1 & y[6] == 2;
+  res[7] = x[7] == 1 & y[7] == 2;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp2" } 
} */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c
new file mode 100644
index 000..b3469c41c87
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/slp-mask-run-1.c
@@ -0,0 +1,31 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=gnu99 -O3 -march=rv64gcv -mabi=lp64d 
--param=riscv-autovec-preference=scalable" } */
+
+#include 
+#include 
+
+#include "slp-mask-1.c"
+
+#define SZ 8
+
+__attribute__ ((optimize ("1")))
+int main ()
+{
+  int *a = malloc (SZ * sizeof (*a));
+  short *b = malloc (SZ * sizeof (*b));
+  int *res = malloc (SZ * sizeof (*res));
+  int *ref = malloc (SZ * sizeof (*ref));
+
+  for (int i = 0; i < SZ; i++)
+{
+  a[i] = i & 1;
+  b[i] = 2;
+  ref[i] = a[i] == 1 & b[i] == 2;
+}
+
+  f (a, b, res);
+
+  for (int i = 0; i < SZ; i++)
+if (res[i] != ref[i])
+  __builtin_abort ();
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index cd7c1090d88..e612da6c492 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6084,14 +6084,16 @@ vectorizable_assignment (vec_info *vinfo,
   /* But a conversion that does not change the bit-pattern is ok.  */
   && !(INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
   && INTEGRAL_TYPE_P (TREE_TYPE (op))
-  && (TYPE_PRECISION (TREE_TYPE (scalar_dest))
+  && (((TYPE_PRECISION (TREE_TYPE (scalar_dest))
   > TYPE_PRECISION (TREE_TYPE (op)))
-  && TYPE_UNSIGNED (TREE_TYPE (op
+&& TYPE_UNSIGNED (TREE_TYPE (op)))
+  || (TYPE_PRECISION (TREE_TYPE (scalar_dest))
+  == TYPE_PRECISION (TREE_TYPE (op))
 {
   if (dump_enabled_p ())
-dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "type conversion to/from bit-precision "
- "unsupported.\n");
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"type conversion to/from bit-precision "
+"unsupported.\n");
   return false;
 }
 
-- 
2.41.0



Re: [PATCH v3 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-10-18 Thread Jason Merrill

On 10/18/23 07:46, waffl3x wrote:

Any progress on this, or do I need to coax the process along?  :)


Yeah, I've been working on it since the copyright assignment process
has finished, originally I was going to note that on my next update
which I had hoped to finish today or tomorrow. Well, in truth I was
hoping to send one the same day that copyright assignment finished, but
I found a nasty bug so I spent all day adding test cases for all the
relevant overloadable operators. Currently, it crashes when calling a
subscript operator declared with an explicit object parameter in a
dependent context. I haven't looked into the fix yet, but I plan to.

Also, before I forget, what is the process for confirming my copyright
assignment on your end? Do you just need to check in with the FSF to
see if it went through? Let me know if there's anything you need from
me regarding that.

Aside from the bug that's currently present in the first patch, I only
have like 1 or 2 little things I want to change about it. I will make
those few changes to patch 1, finish patch 2 (diagnostics) which will
also include test cases for the new bug I found. After I am done that I
plan on adding the things that are missing, because I suspect that
looking into that will get me close to finding the fix for the crash.


Hmm, is it? I see that clang thinks so, but I don't know where they get
that idea from. The grammar certainly allows it:


attribute-specifier-seqopt decl-specifier-seq declarator = initializer-clause


and I don't see anything else that prohibits it.


You would be right for P0847R7, but CWG DR 2653 changed that. You can
find the updated grammar in dcl.fct section 3 (subsection? I'm not
really sure to be honest.)

I've also included a copy of the grammar here for your convenience.

https://eel.is/c++draft/dcl.fct#nt:parameter-declaration
parameter-declaration:
   attribute-specifier-seqopt thisopt decl-specifier-seq declarator
   attribute-specifier-seqopt decl-specifier-seq declarator = initializer-clause
   attribute-specifier-seqopt thisopt decl-specifier-seq abstract-declaratoropt
   attribute-specifier-seqopt decl-specifier-seq abstract-declaratoropt = 
initializer-clause
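
Concretely (example mine): the thisopt productions are exactly the ones
without an initializer-clause, so an explicit object parameter itself
cannot have a default argument, while a later parameter still can:

// Illustration of the CWG 2653 grammar above (example mine).
struct S {
  void f (this S self, int n = 42) {} // OK: default on a later parameter
  // void g (this S self = S{}) {}    // ill-formed: no production allows
                                      // 'this' with '= initializer-clause'
};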


Ah, yes, thanks.


I was thinking to set a TREE_LANG_FLAG on the TREE_LIST node.


I did figure this is originally what you meant, and I can still change
it to go this route since I'm sure it's likely just as good. But I do
recall something I didn't like in the implementation that nudged me
towards using the purpose member instead. Either way, not a big deal. I
think I just liked not having to mess with a linked list as I am not
used to them as a data structure, it might have been that simple. :^)


I wouldn't expect to need any actual dealing with the linked list, just 
setting a flag in cp_parameter_declaration_list at the same point as the 
existing PARENTHESIZED_LIST_P flag.


But given CWG2653 as you pointed out, your current approach is fine.


I will try to get something done today, but I was struggling with
writing some of the tests, there's also a lot more of them now. I also
wrote a bunch of musings in comments that I would like feedback on.

My most concrete question is: how exactly should I be testing a
pedwarn? I want to test that I get the correct warning and error with
the separate flags; do I have to create two separate tests for each one?


Yes.  I tend to use letter suffixes for tests that vary only in flags 
(and expected results), e.g. feature1a.C, feature1b.C.
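
A minimal shape for such a pair, under assumed file names, options and
diagnostic text (all mine, not from the patch):

// feature1a.C: the pedwarn as a warning.
// { dg-do compile { target c++20 } }
// { dg-options "" }
struct A { void f (this A); };  // { dg-warning "explicit object" }

// feature1b.C: the same line under -pedantic-errors.
// { dg-do compile { target c++20 } }
// { dg-options "-pedantic-errors" }
struct B { void f (this B); };  // { dg-error "explicit object" }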



I'm just going to include the little wall I wrote in decl.cc regarding
pedwarn, just in case I can't get this done tonight so I can get some
feedback regarding it. On the other hand, it might just not be very
relevant to this patch in particular as I kind of noted, but maybe
there's some way to do what I was musing about that I've overlooked. It
does end up a bit ranty I guess, hopefully that doesn't make it
confusing.

```
/* I believe we should make a switch for this feature specifically,
I recall seeing discussion regarding enabling newer language
features when set to older standards. I would advocate for a
specific flag for each specific feature. Maybe they should all
be under an override flag? -foverride-dialect=xobj,ifconstexpr (?)
I don't think it makes sense to give each feature override its own
flag. I don't recall what the consensus was around this discussion
either though.

For the time being it's controlled by pedantic. I am concerned that
tying this to pedantic going forward that one might want to enable
-pedantic-errors while also enabling select features from newer
dialects. It didn't look like this use case is supported to me.

I suppose this will require redesign work to support, so for
the purposes of this patch, emitting a pedwarn seems correct.
I just don't like that it can't be suppressed on an individual
basis.  */
if (xobj_parm && cxx_dialect < cxx23)
   pedwarn(DECL_SOURCE_LOCATION (xobj_parm), OPT_Wpedantic, "");

Re: Re: [PATCH] RISC-V: Add popcount fallback expander.

2023-10-18 Thread 钟居哲

Could you, by the way, mention this PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111791
and add the test from that PR?

juzhe.zh...@rivai.ai

From: Robin Dapp
Date: 2023-10-18 21:51
To: juzhe.zh...@rivai.ai; gcc-patches; palmer; kito.cheng; jeffreyalaw
CC: rdapp.gcc
Subject: Re: [PATCH] RISC-V: Add popcount fallback expander.

I didn't push this yet because it would have introduced an UNRESOLVED that
my summary script didn't catch.  Normally I go with just contrib/test_summary
but that only filters out FAIL and XPASS.  I should really be using
compare_testsuite_log.py from riscv-gnu-toolchain/scripts.

It was caused by a typo:

-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp2" } } */

Regards
 Robin

Re: [Backport RFA] lra: Avoid unfolded plus-0

2023-10-18 Thread Vladimir Makarov



On 10/18/23 09:37, Richard Sandiford wrote:

Vlad, is it OK if I backport the patch below to fix
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111528 ?  Jakub has
given a conditional OK on irc.


Ok.  It should be safe.  I don't expect any issues because of this.



Re: [PATCH] RISC-V: Add popcount fallback expander.

2023-10-18 Thread Robin Dapp
I didn't push this yet because it would have introduced an UNRESOLVED that
my summary script didn't catch.  Normally I go with just contrib/test_summary
but that only filters out FAIL and XPASS.  I should really be using
compare_testsuite_log.py from riscv-gnu-toolchain/scripts.

It was caused by a typo:

-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp" } } 
*/
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp2" } 
} */

Regards
 Robin


[Backport RFA] lra: Avoid unfolded plus-0

2023-10-18 Thread Richard Sandiford
Vlad, is it OK if I backport the patch below to fix
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111528 ?  Jakub has
given a conditional OK on irc.

Thanks,
Richard

Richard Sandiford  writes:
> While backporting another patch to an earlier release, I hit a
> situation in which lra_eliminate_regs_1 would eliminate an address to:
>
> (plus (reg:P R) (const_int 0))
>
> This address compared not-equal to plain:
>
> (reg:P R)
>
> which caused an ICE in a later peephole2.  (The ICE showed up in
> gfortran.fortran-torture/compile/pr80464.f90 on the branch but seems
> to be latent on trunk.)
>
> These unfolded PLUSes shouldn't occur in the insn stream, and later code
> in the same function tried to avoid them.
>
> Tested on aarch64-linux-gnu so far, but I'll test on x86_64-linux-gnu too.
> Does this look OK?
>
> There are probably other instances of the same thing elsewhere,
> but it seemed safer to stick to the one that caused the issue.
>
> Thanks,
> Richard
>
>
> gcc/
>   * lra-eliminations.cc (lra_eliminate_regs_1): Use simplify_gen_binary
>   rather than gen_rtx_PLUS.
> ---
>  gcc/lra-eliminations.cc | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc
> index df613cdda76..4daaff1a124 100644
> --- a/gcc/lra-eliminations.cc
> +++ b/gcc/lra-eliminations.cc
> @@ -406,7 +406,7 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode 
> mem_mode,
>   elimination_fp2sp_occured_p = true;
>  
> if (! update_p && ! full_p)
> - return gen_rtx_PLUS (Pmode, to, XEXP (x, 1));
> + return simplify_gen_binary (PLUS, Pmode, to, XEXP (x, 1));
>  
> if (maybe_ne (update_sp_offset, 0))
>   offset = ep->to_rtx == stack_pointer_rtx ? update_sp_offset : 0;


Re: [PATCH V5] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721]

2023-10-18 Thread juzhe.zh...@rivai.ai
Hi, this patch fixes the V4 issue:

Previously as Richard S commented:
https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633178.html 

slp_op and mask_vectype are only initialised when mask_index >= 0.
Shouldn't this code be under mask_index >= 0 too?
Also, when do we encounter mismatched mask_vectypes?  Presumably the SLP
node has a known vectype by this point.  I think a comment would be useful.

Since I didn't encounter the mismatched case in the RISC-V and X86 
regressions, I fixed it in the V4 patch as follows:
+  if (mask_index >= 0 && slp_node)
+   {
+ bool match_p
+   = vect_maybe_update_slp_op_vectype (slp_op, mask_vectype);
+ gcc_assert (match_p);
+   }
Add assertion here.

However, an ICE appeared today in the RISC-V regression:

FAIL: gcc.dg/tree-ssa/pr44306.c (internal compiler error: in vectorizable_load, 
at tree-vect-stmts.cc:9885)
FAIL: gcc.dg/tree-ssa/pr44306.c (test for excess errors)

This is because we encounter a case where mask_vectype is a boolean type 
and an external def.
Then vect_maybe_update_slp_op_vectype will return false.

So I fixed this piece of code in V5:

+  if (mask_index >= 0 && slp_node
+ && !vect_maybe_update_slp_op_vectype (slp_op, mask_vectype))
+   {
+ /* We don't vectorize the boolean type external SLP mask.  */
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"incompatible vector types for invariants\n");
+ return false;
+   }

Bootstrap and Regression on x86 passed.

Thanks.


juzhe.zh...@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-10-18 20:36
To: gcc-patches
CC: richard.sandiford; rguenther; Juzhe-Zhong
Subject: [PATCH V5] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721]
This patch fixes the following FAILs in the RISC-V regression:
 
FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects  scan-tree-dump vect 
"Loop contains only SLP stmts"
FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP 
stmts"
FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects  scan-tree-dump vect 
"Loop contains only SLP stmts"
FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP 
stmts"
 
The root cause of these FAILs is that GCC SLP fails on MASK_LEN_GATHER_LOAD.
 
We have 2 following situations of scalar recognized MASK_LEN_GATHER_LOAD:
 
1. conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, 
conditional mask).
   
   In this situation we just need to leverage the current MASK_GATHER_LOAD 
handling, which can achieve SLP of MASK_LEN_GATHER_LOAD.
 
2. un-conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, 
-1)
   
   The current SLP check will fail on the dummy mask -1, so we relax the check in 
tree-vect-slp.cc and allow it to be materialized.

Consider the following case:
 
void __attribute__((noipa))
f (int *restrict y, int *restrict x, int *restrict indices, int n)
{
  for (int i = 0; i < n; ++i)
{
  y[i * 2] = x[indices[i * 2]] + 1;
  y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
}
}
 
https://godbolt.org/z/WG3M3n7Mo
 
GCC is unable to SLP and instead uses VEC_LOAD_LANES/VEC_STORE_LANES:
 
f:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e8,mf4,ta,ma
vsetvli zero,a5,e32,m1,ta,ma
vlseg2e32.v v6,(a2)
vsetvli a4,zero,e64,m2,ta,ma
vsext.vf2   v2,v6
vsll.vi v2,v2,2
vsetvli zero,a5,e32,m1,ta,ma
vluxei64.v  v1,(a1),v2
vsetvli a4,zero,e64,m2,ta,ma
vsext.vf2   v2,v7
vsetvli zero,zero,e32,m1,ta,ma
vadd.vi v4,v1,1
vsetvli zero,zero,e64,m2,ta,ma
vsll.vi v2,v2,2
vsetvli zero,a5,e32,m1,ta,ma
vluxei64.v  v2,(a1),v2
vsetvli a4,zero,e32,m1,ta,ma
slli    a6,a5,3
vadd.vi v5,v2,2
sub a3,a3,a5
vsetvli zero,a5,e32,m1,ta,ma
vsseg2e32.v v4,(a0)
add a2,a2,a6
add a0,a0,a6
bne a3,zero,.L3
.L5:
ret
 
After this patch:
 
f:
ble a3,zero,.L5
li a5,1
csrr t1,vlenb
slli a5,a5,33
srli a7,t1,2
addi a5,a5,1
slli a3,a3,1
neg t3,a7
vsetvli a4,zero,e64,m1,ta,ma
vmv.v.x v4,a5
.L3:
minu a5,a3,a7
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v1,0(a2)
vsetvli a4,zero,e64,m2,ta,ma
vsext.vf2 v2,v1
vsll.vi v2,v2,2
vsetvli zero,a5,e32,m1,ta,ma
vluxei64.v v2,(a1),v2
vsetvli a4,zero,e32,m1,ta,ma
mv a6,a3
vadd.vv v2,v2,v4
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v2,0(a0)
add a2,a2,t1
add a0,a0,t1
add a3,a3,t3
bgtu a6,a7,.L3
.L5:
ret
 
Note that I found we were missing a conditional mask gather_load SLP test, 
so I appended one in this patch.
 
Tested on RISC-V; bootstrap && regression on X86 passed.
 
Ok for trunk ?
 
gcc/ChangeLog:
 
* tree-vect-slp.cc (vect_get_operand_map): Add MASK_LEN_GATHER_LOAD.
(vect_get_and_check_slp_defs): Ditto.
(vect_build_slp_tree_1): Ditto.
(vect_build_slp_tree_2): Ditto.
* tree-vect-stmts.cc (vectorizable_load): 

[PATCH V5] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721]

2023-10-18 Thread Juzhe-Zhong
This patch fixes the following FAILs in the RISC-V regression:

FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects  scan-tree-dump vect 
"Loop contains only SLP stmts"
FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP 
stmts"
FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects  scan-tree-dump vect 
"Loop contains only SLP stmts"
FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP 
stmts"

The root cause of these FAILs is that GCC SLP fails on MASK_LEN_GATHER_LOAD.

We have 2 following situations of scalar recognized MASK_LEN_GATHER_LOAD:

1. conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, 
conditional mask).
   
   In this situation we just need to leverage the current MASK_GATHER_LOAD 
handling, which can achieve SLP of MASK_LEN_GATHER_LOAD.

2. un-conditional gather load: MASK_LEN_GATHER_LOAD (base, offset, scale, zero, 
-1)
   
   The current SLP check will fail on the dummy mask -1, so we relax the check in 
tree-vect-slp.cc and allow it to be materialized.

Consider the following case:

void __attribute__((noipa))
f (int *restrict y, int *restrict x, int *restrict indices, int n)
{
  for (int i = 0; i < n; ++i)
{
  y[i * 2] = x[indices[i * 2]] + 1;
  y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
}
}

https://godbolt.org/z/WG3M3n7Mo

GCC is unable to SLP and instead uses VEC_LOAD_LANES/VEC_STORE_LANES:

f:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e8,mf4,ta,ma
vsetvli zero,a5,e32,m1,ta,ma
vlseg2e32.v v6,(a2)
vsetvli a4,zero,e64,m2,ta,ma
vsext.vf2   v2,v6
vsll.vi v2,v2,2
vsetvli zero,a5,e32,m1,ta,ma
vluxei64.v  v1,(a1),v2
vsetvli a4,zero,e64,m2,ta,ma
vsext.vf2   v2,v7
vsetvli zero,zero,e32,m1,ta,ma
vadd.vi v4,v1,1
vsetvli zero,zero,e64,m2,ta,ma
vsll.vi v2,v2,2
vsetvli zero,a5,e32,m1,ta,ma
vluxei64.v  v2,(a1),v2
vsetvli a4,zero,e32,m1,ta,ma
slli    a6,a5,3
vadd.vi v5,v2,2
sub a3,a3,a5
vsetvli zero,a5,e32,m1,ta,ma
vsseg2e32.v v4,(a0)
add a2,a2,a6
add a0,a0,a6
bne a3,zero,.L3
.L5:
ret

After this patch:

f:
ble a3,zero,.L5
li  a5,1
csrr    t1,vlenb
slli    a5,a5,33
srli    a7,t1,2
addi    a5,a5,1
slli    a3,a3,1
neg t3,a7
vsetvli a4,zero,e64,m1,ta,ma
vmv.v.x v4,a5
.L3:
minu    a5,a3,a7
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v1,0(a2)
vsetvli a4,zero,e64,m2,ta,ma
vsext.vf2   v2,v1
vsll.vi v2,v2,2
vsetvli zero,a5,e32,m1,ta,ma
vluxei64.v  v2,(a1),v2
vsetvli a4,zero,e32,m1,ta,ma
mv  a6,a3
vadd.vv v2,v2,v4
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v2,0(a0)
add a2,a2,t1
add a0,a0,t1
add a3,a3,t3
bgtu    a6,a7,.L3
.L5:
ret

Note that I found we were missing a conditional mask gather_load SLP test, 
so I appended one in this patch.

Tested on RISC-V; bootstrap && regression on X86 passed.

Ok for trunk ?

gcc/ChangeLog:

* tree-vect-slp.cc (vect_get_operand_map): Add MASK_LEN_GATHER_LOAD.
(vect_get_and_check_slp_defs): Ditto.
(vect_build_slp_tree_1): Ditto.
(vect_build_slp_tree_2): Ditto.
* tree-vect-stmts.cc (vectorizable_load): Ditto.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-gather-6.c: New test.

---
 gcc/testsuite/gcc.dg/vect/vect-gather-6.c | 15 +++
 gcc/tree-vect-slp.cc  | 22 ++
 gcc/tree-vect-stmts.cc| 12 +++-
 3 files changed, 44 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-gather-6.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-6.c 
b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
new file mode 100644
index 000..ff55f321854
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-gather-6.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+void
+f (int *restrict y, int *restrict x, int *restrict indices, int *restrict 
cond, int n)
+{
+  for (int i = 0; i < n; ++i)
+{
+  if (cond[i * 2])
+   y[i * 2] = x[indices[i * 2]] + 1;
+  if (cond[i * 2 + 1])
+   y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
+}
+}
+
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target 
vect_gather_load_ifn } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index d081999a763..146dba658a2 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -552,6 +552,7 @@ vect_get_operand_map (const gimple *stmt, unsigned char 
swap = 0)
return arg1_map;
 
  case IFN_MASK_GATHER_LOAD:
+ case IFN_MASK_LEN_GATHER_LOAD:
return arg1_arg4_map;
 
  case IFN_MASK_STORE:
@@ -719,8 +720,7 @@ 

[PATCH] RISC-V: Add popcount fallback expander.

2023-10-18 Thread juzhe.zh...@rivai.ai
LGTM popcount patch.



juzhe.zh...@rivai.ai


Re: [PATCH] Support g++ 4.8 as a host compiler.

2023-10-18 Thread Jakub Jelinek
On Wed, Oct 18, 2023 at 01:33:40PM +0200, Jakub Jelinek wrote:
> Making it guaranteed that it has at least one argument, say through
>   template <typename U, typename ...T> poly_int (const U &, const T &...) {}
> fixes it for 4.8/4.9 as well.

So, perhaps (but so far totally untested, the other bootstrap is still
running):

2023-10-18  Jakub Jelinek  

* poly-int.h (poly_int::poly_int): Ensure the const Cs &...
argument ctor has at least one argument.

--- gcc/poly-int.h.jj   2023-10-13 19:34:44.112832389 +0200
+++ gcc/poly-int.h  2023-10-18 13:49:29.038751482 +0200
@@ -379,8 +379,8 @@ public:
   template<typename Ca>
   poly_int (const poly_int<N, Ca> &);
 
-  template<typename ...Cs>
-  constexpr poly_int (const Cs &...);
+  template<typename C0, typename ...Cs>
+  constexpr poly_int (const C0 &, const Cs &...);
 
   poly_int &operator= (const poly_int &) = default;
 
@@ -446,11 +446,11 @@ poly_int<N, C>::poly_int (const poly_int<N,
 }
 
 template<unsigned int N, typename C>
-template<typename ...Cs>
+template<typename C0, typename ...Cs>
 inline constexpr
-poly_int<N, C>::poly_int (const Cs &... cs)
+poly_int<N, C>::poly_int (const C0 &c0, const Cs &... cs)
   : poly_int (typename poly_int_fullness<sizeof... (Cs) + 1 >= N>::type (),
-	      cs...) {}
+	      c0, cs...) {}
 
 /* Initialize with c0, cs..., and some trailing zeros.  */
 template<unsigned int N, typename C>


Jakub
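
For what it's worth, a self-contained illustration of the constructor
shape after the change (the struct and names are mine, not poly-int.h's;
the exact g++ 4.8 diagnostic isn't quoted in this thread):

#include <cstdio>

// Sketch only: peeling an explicit first parameter off the variadic
// constructor guarantees it can never be considered with zero arguments.
template <typename T, unsigned N>
struct P {
  template <typename C0, typename... Cs>
  constexpr P (const C0 &c0, const Cs &... cs) : coeffs {T (c0), T (cs)...} {}
  T coeffs[N];
};

int main ()
{
  constexpr P<int, 2> p (1, 2);
  std::printf ("%d %d\n", p.coeffs[0], p.coeffs[1]);
  return 0;
}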



RE: [PATCH v1] RISC-V: Remove the type size restriction of vectorizer

2023-10-18 Thread Li, Pan2
Thanks Richard. Let's wait a while in case there are comments from others, 
since I am not that familiar with these parts.

Pan

-Original Message-
From: Richard Biener  
Sent: Wednesday, October 18, 2023 2:34 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang 
; kito.ch...@gmail.com; Liu, Hongtao 

Subject: Re: [PATCH v1] RISC-V: Remove the type size restriction of vectorizer

On Wed, Oct 18, 2023 at 3:20 AM  wrote:
>
> From: Pan Li 
>
> The vectorizable_call function has a restriction on the size of the data type.
> Aka DF to DI is allowed but SF to DI isn't. You may see the below message
> when trying to vectorize a function call like lrintf.
>
> void
> test_lrintf (long *out, float *in, unsigned count)
> {
>   for (unsigned i = 0; i < count; i++)
> out[i] = __builtin_lrintf (in[i]);
> }
>
> lrintf.c:5:26: missed: couldn't vectorize loop
> lrintf.c:5:26: missed: not vectorized: unsupported data-type
>
> Then the standard name pattern like lrintmn2 cannot work for different
> data type sizes like SF => DI. This patch removes this data
> type size check and unblocks standard names like lrintmn2.
>
> Passed the x86 bootstrap and regression test already.

OK.

On x86 we seem to have lrintsfdi2 but not lrintv4sfv4di2, with SLP
vectorization we could expect to see the following vectorized after
the patch (with loop vectorization you'll see us pre-select same sized
vector types)

long int x[4];
float y[4];

void foo ()
{
  x[0] = __builtin_lrintf (y[0]);
  x[1] = __builtin_lrintf (y[1]);
  x[2] = __builtin_lrintf (y[2]);
  x[3] = __builtin_lrintf (y[3]);
}


> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_call): Remove data size
> check.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/tree-vect-stmts.cc | 13 -
>  1 file changed, 13 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index b3a56498595..326e000a71d 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3529,19 +3529,6 @@ vectorizable_call (vec_info *vinfo,
>
>return false;
>  }
> -  /* FORNOW: we don't yet support mixtures of vector sizes for calls,
> - just mixtures of nunits.  E.g. DI->SI versions of __builtin_ctz*
> - are traditionally vectorized as two VnDI->VnDI IFN_CTZs followed
> - by a pack of the two vectors into an SI vector.  We would need
> - separate code to handle direct VnDI->VnSI IFN_CTZs.  */
> -  if (TYPE_SIZE (vectype_in) != TYPE_SIZE (vectype_out))
> -{
> -  if (dump_enabled_p ())
> -   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -"mismatched vector sizes %T and %T\n",
> -vectype_in, vectype_out);
> -  return false;
> -}
>
>if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
>!= VECTOR_BOOLEAN_TYPE_P (vectype_in))
> --
> 2.34.1
>


Re: [PATCH v3 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-10-18 Thread waffl3x
> Any progress on this, or do I need to coax the process along?  :)

Yeah, I've been working on it since the copyright assignment process
has finished, originally I was going to note that on my next update
which I had hoped to finish today or tomorrow. Well, in truth I was
hoping to send one the same day that copyright assignment finished, but
I found a nasty bug so I spent all day adding test cases for all the
relevant overloadable operators. Currently, it crashes when calling a
subscript operator declared with an explicit object parameter in a
dependent context. I haven't looked into the fix yet, but I plan to.
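
A guess at the shape of that crash (reconstruction mine, not a reduced
testcase from the thread):

// operator[] with an explicit object parameter, invoked from a
// dependent context; hypothetical reproducer.
template <typename T>
struct S {
  T v;
  T &operator[] (this S &self, int) { return self.v; }
};

template <typename T>
T get (S<T> &s) { return s[0]; }  // dependent call to the subscript operator

int main () { S<int> s{42}; return get (s) != 42; }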

Also, before I forget, what is the process for confirming my copyright
assignment on your end? Do you just need to check in with the FSF to
see if it went through? Let me know if there's anything you need from
me regarding that.

Aside from the bug that's currently present in the first patch, I only
have like 1 or 2 little things I want to change about it. I will make
those few changes to patch 1, finish patch 2 (diagnostics) which will
also include test cases for the new bug I found. After I am done that I
plan on adding the things that are missing, because I suspect that
looking into that will get me close to finding the fix for the crash.

> Hmm, is it? I see that clang thinks so, but I don't know where they get
> that idea from. The grammar certainly allows it:
> 
> > attribute-specifier-seqopt decl-specifier-seq declarator = 
> > initializer-clause
> 
> 
> and I don't see anything else that prohibits it.

You would be right for P0847R7, but CWG DR 2653 changed that. You can
find the updated grammar in dcl.fct section 3 (subsection? I'm not
really sure to be honest.)

I've also included a copy of the grammar here for your convenience.

https://eel.is/c++draft/dcl.fct#nt:parameter-declaration
parameter-declaration:
  attribute-specifier-seqopt thisopt decl-specifier-seq declarator
  attribute-specifier-seqopt decl-specifier-seq declarator = initializer-clause
  attribute-specifier-seqopt thisopt decl-specifier-seq abstract-declaratoropt
  attribute-specifier-seqopt decl-specifier-seq abstract-declaratoropt = 
initializer-clause 


> I was thinking to set a TREE_LANG_FLAG on the TREE_LIST node.

I did figure this is originally what you meant, and I can still change
it to go this route since I'm sure it's likely just as good. But I do
recall something I didn't like in the implementation that nudged me
towards using the purpose member instead. Either way, not a big deal. I
think I just liked not having to mess with a linked list as I am not
used to them as a data structure, it might have been that simple. :^)

I will try to get something done today, but I was struggling with
writing some of the tests, there's also a lot more of them now. I also
wrote a bunch of musings in comments that I would like feedback on.

My most concrete question is: how exactly should I be testing a
pedwarn? I want to test that I get the correct warning and error with
the separate flags; do I have to create two separate tests for each one?
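
What I have in mind would look something like this — a sketch only; the file
names and the diagnostic regex are placeholders, not what the patch will
actually emit:

```cpp
// explicit-object-param-pedwarn-1.C -- pedwarn stays a warning
// { dg-do compile { target c++20 } }
// { dg-options "-pedantic" }
struct S {
  void f(this S) {}  // { dg-warning "" "xobj parameter pedwarn" }
};

// explicit-object-param-pedwarn-2.C -- same input, -pedantic-errors
// { dg-do compile { target c++20 } }
// { dg-options "-pedantic-errors" }
struct T {
  void f(this T) {}  // { dg-error "" "xobj parameter pedwarn" }
};
```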

I'm just going to include the little wall of text I wrote in decl.cc
regarding pedwarn, just in case I can't get this done tonight, so I can
get some feedback on it. On the other hand, it might just not be very
relevant to this patch in particular, as I kind of noted, but maybe
there's some way to do what I was musing about that I've overlooked. It
does end up a bit ranty, I guess; hopefully that doesn't make it
confusing.

```
/* I believe we should make a switch for this feature specifically.
   I recall seeing discussion regarding enabling newer language
   features when set to older standards. I would advocate for a
   specific flag for each specific feature. Maybe they should all
   be under an override flag? -foverride-dialect=xobj,ifconstexpr (?)
   I don't think it makes sense to give each feature override its own
   flag. I don't recall what the consensus was around this discussion
   either though.
   For the time being it's controlled by pedantic. I am concerned about
   tying this to pedantic going forward, as one might want to enable
   -pedantic-errors while also enabling select features from newer
   dialects. It didn't look to me like this use case is supported.

   I suppose this will require redesign work to support, so for
   the purposes of this patch, emitting a pedwarn seems correct.
   I just don't like that it can't be suppressed on an individual
   basis.  */
if (xobj_parm && cxx_dialect < cxx23)
  pedwarn(DECL_SOURCE_LOCATION (xobj_parm), OPT_Wpedantic, "");
```

That's all for now, I will try, (but I am very much not promising,) to
have an update by the end of today (6-8 hours for me.) If I manage to
get that out, I will (finally) start moving forward on implementing the
missing and broken features, and the aforementioned bug.

Alex



Re: [PATCH] RISC-V: Add popcount fallback expander.

2023-10-18 Thread Robin Dapp
> I saw you didn't extend VI -> V_VLSI. I guess it will fail SLP on popcount.

Added VLS modes and your test in v2.

Testsuite looks unchanged on my side (vect, dg, rvv).

Regards
 Robin

Subject: [PATCH v2] RISC-V: Add popcount fallback expander.

I didn't manage to get back to the generic vectorizer fallback for
popcount so I figured I'd rather create a popcount fallback in the
riscv backend.  It uses the WWG algorithm from libgcc.
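
For reference, the scalar form of the reduction looks like this — my own
sketch of the well-known algorithm, not part of the patch:

```c
#include <stdint.h>

/* Wilkes-Wheeler-Gill popcount: fold 1-bit counts into 2-bit sums, then
   4-bit and 8-bit sums, and finally add all byte sums in one multiply.  */
static uint64_t
wwg_popcount (uint64_t x)
{
  x = x - ((x >> 1) & 0x5555555555555555ULL);
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
  return (x * 0x0101010101010101ULL) >> 56;
}
```

The expander below performs the same mask-and-add steps element-wise, which
is why it only needs the four constants m5, m3, mf and m1.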

gcc/ChangeLog:

* config/riscv/autovec.md (popcount<mode>2): New expander.
* config/riscv/riscv-protos.h (expand_popcount): Define.
* config/riscv/riscv-v.cc (expand_popcount): Vectorize popcount
with the WWG algorithm.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/unop/popcount-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/popcount-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/popcount-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/popcount.c: New test.
---
 gcc/config/riscv/autovec.md   |   14 +
 gcc/config/riscv/riscv-protos.h   |1 +
 gcc/config/riscv/riscv-v.cc   |   71 +
 .../riscv/rvv/autovec/unop/popcount-1.c   |   20 +
 .../riscv/rvv/autovec/unop/popcount-2.c   |   19 +
 .../riscv/rvv/autovec/unop/popcount-run-1.c   |   49 +
 .../riscv/rvv/autovec/unop/popcount.c | 1464 +
 7 files changed, 1638 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount-run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/popcount.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index c5b1e52cbf9..80910ba3cc2 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -1484,6 +1484,20 @@ (define_expand "xorsign3"
   DONE;
 })
 
+;; -------------------------------------------------------------------------
+;; - [INT] POPCOUNT.
+;; -------------------------------------------------------------------------
+
+(define_expand "popcount<mode>2"
+  [(match_operand:V_VLSI 0 "register_operand")
+   (match_operand:V_VLSI 1 "register_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_popcount (operands);
+  DONE;
+})
+
+
 ;; -------------------------------------------------------------------------
 ;;  [INT] Highpart multiplication
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 49bdcdf2f93..4aeccdd961b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -515,6 +515,7 @@ void expand_fold_extract_last (rtx *);
 void expand_cond_unop (unsigned, rtx *);
 void expand_cond_binop (unsigned, rtx *);
 void expand_cond_ternop (unsigned, rtx *);
+void expand_popcount (rtx *);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 21d86c3f917..8b594b7127e 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -4152,4 +4152,75 @@ expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode 
vec_fp_mode,
   emit_vec_cvt_x_f (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode);
 }
 
+/* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
+   well.  */
+void
+expand_popcount (rtx *ops)
+{
+  rtx dst = ops[0];
+  rtx src = ops[1];
+  machine_mode mode = GET_MODE (dst);
+  scalar_mode imode = GET_MODE_INNER (mode);
+  static const uint64_t m5 = 0x5555555555555555ULL;
+  static const uint64_t m3 = 0x3333333333333333ULL;
+  static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
+  static const uint64_t m1 = 0x0101010101010101ULL;
+
+  rtx x1 = gen_reg_rtx (mode);
+  rtx x2 = gen_reg_rtx (mode);
+  rtx x3 = gen_reg_rtx (mode);
+  rtx x4 = gen_reg_rtx (mode);
+
+  /* x1 = src - ((src >> 1) & 0x5555555555555555ULL);  */
+  rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
+OPTAB_DIRECT);
+
+  rtx and1 = gen_reg_rtx (mode);
+  rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
+  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
+  ops1);
+
+  x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
+
+  /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
+   */
+  rtx and2 = gen_reg_rtx (mode);
+  rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
+  emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
+  ops2);
+
+  rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
+OPTAB_DIRECT);
+
+  rtx and22 = gen_reg_rtx (mode);
+  rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
+  

Re: [PATCH] Support g++ 4.8 as a host compiler.

2023-10-18 Thread Jakub Jelinek
On Wed, Oct 18, 2023 at 11:23:49AM +0100, Richard Sandiford wrote:
> > --- a/gcc/cse.cc
> > +++ b/gcc/cse.cc
> > @@ -4951,8 +4951,14 @@ cse_insn (rtx_insn *insn)
> >   && is_a <scalar_int_mode> (mode, &int_mode)
> >   && (extend_op = load_extend_op (int_mode)) != UNKNOWN)
> > {
> > +#if GCC_VERSION >= 5000
> >   struct rtx_def memory_extend_buf;
> >   rtx memory_extend_rtx = &memory_extend_buf;
> > +#else
> > + alignas (alignof (rtx_def)) unsigned char
> > +   memory_extended_buf[sizeof (rtx_def)];
> 
> Looks like the simpler "alignas (rtx_def)" should work.

It does.

> LGTM otherwise FWIW.

Here is what I'm bootstrapping/regtesting on gcc112 now (i.e. with 4.8.5
as system compiler), with added details about the bug we are working around.
The reduced testcase on which I've bisected it is:
struct poly_int {
  poly_int() = default;
  template <typename... T> poly_int(const T &...) {}
};
union rtunion {
  poly_int rt_subregrt_rtx;
};
struct rtx_def {
  rtunion fld;
};
void cse_insn() { rtx_def memory_extend_buf; }
or even with just
  template <typename... T> poly_int();
line in there.  I bet gcc 4.8/4.9 was unhappy about the
variadic template ctor accepting an empty pack and being like
the default ctor but not defaulted in that case.
Making it guaranteed that it has at least one argument, say through
  template <typename U, typename... T> poly_int(const U &, const T &...) {}
fixes it for 4.8/4.9 as well.

2023-10-18  Jakub Jelinek  

PR bootstrap/111852
* cse.cc (cse_insn): Add workaround for GCC 4.8-4.9, instead of
using rtx_def type for memory_extend_buf, use unsigned char
	array with size of rtx_def and its alignment.

--- gcc/cse.cc.jj   2023-06-20 08:57:38.339505245 +0200
+++ gcc/cse.cc  2023-10-18 13:20:30.555836778 +0200
@@ -4951,8 +4951,15 @@ cse_insn (rtx_insn *insn)
  && is_a <scalar_int_mode> (mode, &int_mode)
  && (extend_op = load_extend_op (int_mode)) != UNKNOWN)
{
+#if GCC_VERSION >= 5000
  struct rtx_def memory_extend_buf;
  rtx memory_extend_rtx = &memory_extend_buf;
+#else
+ /* Workaround GCC < 5 bug, fixed in r5-3834 as part of PR63362
+fix.  */
+ alignas (rtx_def) unsigned char memory_extended_buf[sizeof (rtx_def)];
+ rtx memory_extend_rtx = (rtx) &memory_extended_buf[0];
+#endif
 
  /* Set what we are trying to extend and the operation it might
 have been extended with.  */


Jakub



Re: [Patch] OpenMP: Avoid ICE with LTO and 'omp allocate' (was: [Patch] Fortran: Support OpenMP's 'allocate' directive for stack vars)

2023-10-18 Thread Jakub Jelinek
On Wed, Oct 18, 2023 at 12:56:01PM +0200, Tobias Burnus wrote:
> On 18.10.23 11:36, Jakub Jelinek wrote:
> > On Wed, Oct 18, 2023 at 11:12:44AM +0200, Thomas Schwinge wrote:
> > >  +FAIL: gfortran.dg/gomp/allocate-13.f90   -O  (internal compiler 
> > > error: tree code 'statement_list' is not supported in LTO streams)
> > Any references to GENERIC code in clauses etc. should have been gimplified
> > or cleared during gimplification, we shouldn't support STATEMENT_LIST
> > in LTO streaming.
> 
> We only needed the statement list as aid during the gimplify.cc handling
> of GOMP_alloc/GOMP_free for Fortran. How about just remove_attribute it
> in that case? As discussed, as DECL_ATTRIBUTE gets added from the left
> to the DECL_CHAIN, there shouldn't be a problem of introducing shared
> trees; note that 'omp allocate' itself is added per DECL, i.e. it does
> not introduce sharing itself, either.
> 
> Tested with x86-64-gnu-linux.
> 
> Tobias

> OpenMP: Avoid ICE with LTO and 'omp allocate'
> 
> gcc/ChangeLog:
> 
>   * gimplify.cc (gimplify_bind_expr): Remove "omp allocate" attribute
>   so that the auxiliary statement list does not reach LTO.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gfortran.dg/gomp/allocate-13a.f90: New test.

LGTM.

Jakub



[Patch] OpenMP: Avoid ICE with LTO and 'omp allocate' (was: [Patch] Fortran: Support OpenMP's 'allocate' directive for stack vars)

2023-10-18 Thread Tobias Burnus

On 18.10.23 11:36, Jakub Jelinek wrote:

On Wed, Oct 18, 2023 at 11:12:44AM +0200, Thomas Schwinge wrote:

 +FAIL: gfortran.dg/gomp/allocate-13.f90   -O  (internal compiler error: 
tree code 'statement_list' is not supported in LTO streams)

Any references to GENERIC code in clauses etc. should have been gimplified
or cleared during gimplification, we shouldn't support STATEMENT_LIST
in LTO streaming.


We only needed the statement list as aid during the gimplify.cc handling
of GOMP_alloc/GOMP_free for Fortran. How about just remove_attribute it
in that case? As discussed, as DECL_ATTRIBUTE gets added from the left
to the DECL_CHAIN, there shouldn't be a problem of introducing shared
trees; note that 'omp allocate' itself is added per DECL, i.e. it does
not introduce sharing itself, either.

Tested with x86-64-gnu-linux.

Tobias
OpenMP: Avoid ICE with LTO and 'omp allocate'

gcc/ChangeLog:

	* gimplify.cc (gimplify_bind_expr): Remove "omp allocate" attribute
	so that the auxiliary statement list does not reach LTO.

gcc/testsuite/ChangeLog:

	* gfortran.dg/gomp/allocate-13a.f90: New test.

 gcc/gimplify.cc | 18 -
 gcc/testsuite/gfortran.dg/gomp/allocate-13a.f90 | 34 +
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 9c617c21381..22ff1075abb 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -1426,7 +1426,8 @@ gimplify_bind_expr (tree *expr_p, gimple_seq *pre_p)
 		  DECL_ATTRIBUTES (v)
 			= tree_cons (get_identifier ("omp allocate var"),
  build_tree_list (NULL_TREE, t),
- DECL_ATTRIBUTES (t));
+ remove_attribute ("omp allocate",
+		   DECL_ATTRIBUTES (t)));
 		  tmp = build_fold_indirect_ref (v);
 		  TREE_THIS_NOTRAP (tmp) = 1;
 		  SET_DECL_VALUE_EXPR (t, tmp);
@@ -1473,7 +1474,12 @@ gimplify_bind_expr (tree *expr_p, gimple_seq *pre_p)
 			 at the top, unless an allocator or size expression
 			 requires to put it afterward; note that the size is
 			 always later in generated code; for strings, no
-			 size expr but still an expr might be available.  */
+			 size expr but still an expr might be available.
+			 As LTO does not handle a statement list, 'sl' has
+			 to be removed; done so by removing the attribute.  */
+		  DECL_ATTRIBUTES (t)
+			= remove_attribute ("omp allocate",
+	DECL_ATTRIBUTES (t));
 		  tree sl = TREE_PURPOSE (TREE_CHAIN (TREE_VALUE (attr)));
 		  tree_stmt_iterator e = tsi_start (sl);
 		  tree needle = NULL_TREE;
@@ -1631,16 +1637,14 @@ gimplify_bind_expr (tree *expr_p, gimple_seq *pre_p)
 	  && !is_global_var (t)
 	  && DECL_CONTEXT (t) == current_function_decl)
 	{
-	  tree attr;
 	  if (flag_openmp
 	  && DECL_HAS_VALUE_EXPR_P (t)
 	  && TREE_USED (t)
-	  && ((attr = lookup_attribute ("omp allocate",
-	DECL_ATTRIBUTES (t))) != NULL_TREE)
-	  && TREE_CHAIN (TREE_VALUE (attr)) == NULL_TREE)
+	  && lookup_attribute ("omp allocate", DECL_ATTRIBUTES (t)))
 	{
 	  /* For Fortran, TREE_CHAIN (TREE_VALUE (attr)) is set, which
-		 causes that the GOMP_free call is already added above.  */
+		 causes that the GOMP_free call is already added above;
+		 and "omp allocate" is removed from DECL_ATTRIBUTES.  */
 	  tree v = TREE_OPERAND (DECL_VALUE_EXPR (t), 0);
 	  tree tmp = builtin_decl_explicit (BUILT_IN_GOMP_FREE);
 	  tmp = build_call_expr_loc (end_locus, tmp, 2, v,
diff --git a/gcc/testsuite/gfortran.dg/gomp/allocate-13a.f90 b/gcc/testsuite/gfortran.dg/gomp/allocate-13a.f90
new file mode 100644
index 000..4b297cdb4aa
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/gomp/allocate-13a.f90
@@ -0,0 +1,34 @@
+! { dg-do compile { target lto } }
+! { dg-additional-options "-flto" }
+
+! Same as allocate-13.f90 but compiled with -flto.
+
+! This was failing before as the statement list,
+! used for placing the GOMP_alloc/GOMP_free leaked
+! through to LTO.
+
+module m
+  implicit none
+  !$omp requires dynamic_allocators
+contains
+subroutine f ()
+  !$omp declare target
+  integer :: var
+  !$omp allocate(var)
+  var = 5
+end
+
+subroutine h ()
+  !$omp target
+   !$omp parallel
+!$omp single
+  block
+   integer :: var2(5)
+   !$omp allocate(var2)
+   var2(1) = 7
+  end block
+!$omp end single
+   !$omp end parallel
+  !$omp end target
+end
+end module


[PATCH v2] libstdc++: testsuite: Enhance codecvt_unicode with tests for length()

2023-10-18 Thread Dimitrij Mijoski
We can test codecvt::length() with the same data that we test
codecvt::in(). For each call of in() we add another call to length().
Some additional small cosmetic changes are applied.
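
To recap the property being asserted — a standalone sketch of my own, not
part of the patch: for a given input range, length() must report exactly the
number of external chars that in() would consume with the same output space.

```cpp
#include <cassert>
#include <locale>

int main()
{
  // This facet is guaranteed in every locale in C++11..C++17
  // (deprecated later); fine for illustration.
  using cvt_t = std::codecvt<char32_t, char, std::mbstate_t>;
  const cvt_t& cvt = std::use_facet<cvt_t>(std::locale::classic());
  const char in[] = "a\xC3\x9F"; // 'a' (1 byte) then U+00DF (2 bytes) in UTF-8
  std::mbstate_t st{};
  // Output room for one char32_t: only the 1-byte 'a' is consumed.
  assert(cvt.length(st, in, in + 3, 1) == 1);
  st = {};
  // Room for two char32_t: all 3 external bytes are consumed.
  assert(cvt.length(st, in, in + 3, 2) == 3);
}
```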

libstdc++-v3/ChangeLog:

* testsuite/22_locale/codecvt/codecvt_unicode.h: Test length()
---
 .../22_locale/codecvt/codecvt_unicode.h   | 123 --
 1 file changed, 110 insertions(+), 13 deletions(-)

diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h 
b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h
index d3ae42fac..42270c50f 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h
@@ -17,7 +17,6 @@
 
 #include 
 #include 
-#include 
 #include 
 
 struct test_offsets_ok
@@ -79,6 +78,11 @@ utf8_to_utf32_in_ok (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::compare (out, exp, t.out_size) == 0);
   if (t.out_size < array_size (out))
VERIFY (out[t.out_size] == 0);
+
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, t.out_size);
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.in_size);
 }
 
   for (auto t : offsets)
@@ -99,6 +103,11 @@ utf8_to_utf32_in_ok (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::compare (out, exp, t.out_size) == 0);
   if (t.out_size < array_size (out))
VERIFY (out[t.out_size] == 0);
+
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, array_size (out));
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.in_size);
 }
 }
 
@@ -163,6 +172,11 @@ utf8_to_utf32_in_partial (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::compare (out, exp, t.expected_out_next)
	   == 0);
   if (t.expected_out_next < array_size (out))
VERIFY (out[t.expected_out_next] == 0);
+
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, t.out_size);
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.expected_in_next);
 }
 }
 
@@ -303,6 +317,11 @@ utf8_to_utf32_in_error (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   if (t.expected_out_next < array_size (out))
VERIFY (out[t.expected_out_next] == 0);
 
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, t.out_size);
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.expected_in_next);
+
   in[t.replace_pos] = old_char;
 }
 }
@@ -334,7 +353,7 @@ utf32_to_utf8_out_ok (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::length (in) == 4);
   VERIFY (char_traits<ExternT>::length (exp) == 10);
 
-  const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
+  test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
   for (auto t : offsets)
 {
   ExternT out[array_size (exp) - 1] = {};
@@ -374,7 +393,7 @@ utf32_to_utf8_out_partial (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::length (in) == 4);
   VERIFY (char_traits<ExternT>::length (exp) == 10);
 
-  const test_offsets_partial offsets[] = {
+  test_offsets_partial offsets[] = {
 {1, 0, 0, 0}, // no space for first CP
 
 {2, 1, 1, 1}, // no space for second CP
@@ -528,6 +547,11 @@ utf8_to_utf16_in_ok (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::compare (out, exp, t.out_size) == 0);
   if (t.out_size < array_size (out))
VERIFY (out[t.out_size] == 0);
+
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, t.out_size);
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.in_size);
 }
 
   for (auto t : offsets)
@@ -548,6 +572,11 @@ utf8_to_utf16_in_ok (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::compare (out, exp, t.out_size) == 0);
   if (t.out_size < array_size (out))
VERIFY (out[t.out_size] == 0);
+
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, array_size (out));
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.in_size);
 }
 }
 
@@ -617,6 +646,11 @@ utf8_to_utf16_in_partial (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::compare (out, exp, t.expected_out_next)
	   == 0);
   if (t.expected_out_next < array_size (out))
VERIFY (out[t.expected_out_next] == 0);
+
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, t.out_size);
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.expected_in_next);
 }
 }
 
@@ -757,6 +791,11 @@ utf8_to_utf16_in_error (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   if (t.expected_out_next < array_size (out))
VERIFY (out[t.expected_out_next] == 0);
 
+  state = {};
+  auto len = cvt.length (state, in, in + t.in_size, t.out_size);
+  VERIFY (len >= 0);
+  VERIFY (static_cast<size_t> (len) == t.expected_in_next);
+
   in[t.replace_pos] = old_char;
 }
 }
@@ -788,7 +827,7 @@ utf16_to_utf8_out_ok (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
   VERIFY (char_traits<InternT>::length (in) == 5);
   VERIFY (char_traits<ExternT>::length (exp) == 10);
 
-  const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
+  test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
   for (auto t : offsets)
 {
   ExternT 

Re: [PATCH V2] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction

2023-10-18 Thread juzhe.zh...@rivai.ai
More details of VSETVL bug:

Loop:
   10ddc:   9ed030d7        vmv1r.v v1,v13
   10de0:   b21040d7        vncvt.x.x.w v1,v1
   10de4:   5e0785d7        vmv.v.v v11,v15
   10de8:   b700a5d7        vmacc.vv v11,v1,v16
   10dec:   a6e8a0d7        vmadd.vv v1,v17,v14
   10df0:   26b7b5d7        vand.vi v11,v11,15
   10df4:   0c75f7d7        vsetvli a5,a1,e8,mf2,ta,ma
   10df8:   0c707557        vsetvli a0,zero,e8,mf2,ta,ma
   10dfc:   2617b0d7        vand.vi v1,v1,15
   10e00:   0c75f057        vsetvli zero,a1,e8,mf2,ta,ma
   10e04:   8d9d            sub a1,a1,a5
   10e06:   020705a7        vse8.v  v11,(a4)
   10e0a:   0c77f057        vsetvli zero,a5,e8,mf2,ta,ma
   10e0e:   020685a7        vse8.v  v11,(a3)
   10e12:   020600a7        vse8.v  v1,(a2)
   10e16:   973e            add a4,a4,a5
   10e18:   0c807557        vsetvli a0,zero,e16,m1,ta,ma
   10e1c:   96be            add a3,a3,a5
   10e1e:   963e            add a2,a2,a5
   10e20:   02d606d7        vadd.vv v13,v13,v12
   10e24:   fdc5            bnez a1,10e24

The vncvt.x.x.w consumes the e16,m1 VTYPE from the preceding vsetvli, but it
shouldn't; it should be e8,mf2. This issue is fixed by the recent refactor patch.


juzhe.zh...@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-10-18 18:25
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH V2] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction
Confirmed the dynamic LMUL algorithm works well, choosing LMUL = 4 for the PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848

But it generates horrible register spilling.

The root cause is that we didn't hoist the vmv.v.x outside the loop, which
increases the SLP loop register pressure.

So, change the CONST_VECTOR move into a vec_duplicate splitter so that we can
gain better optimizations:
 
1. better LICM.
2. More opportunities of transforming 'vv' into 'vx' in the future.
 
Before this patch:
 
f3:
ble a4,zero,.L8
csrr t0,vlenb
slli t1,t0,4
csrr a6,vlenb
sub sp,sp,t1
csrr a5,vlenb
slli a6,a6,3
slli a5,a5,2
add a6,a6,sp
vsetvli a7,zero,e16,m8,ta,ma
slli a4,a4,3
vid.v   v8
addi t6,a5,-1
vand.vi v8,v8,-2
neg t5,a5
vs8r.v  v8,0(sp)
vadd.vi v8,v8,1
vs8r.v  v8,0(a6)
j   .L4
.L12:
vsetvli a7,zero,e16,m8,ta,ma
.L4:
csrr t0,vlenb
slli t0,t0,3
vl8re16.v   v16,0(sp)
add t0,t0,sp
vmv.v.x v8,t6
mv  t1,a4
vand.vv v24,v16,v8
mv  a6,a4
vl8re16.v   v16,0(t0)
vand.vv v8,v16,v8
bleu a4,a5,.L3
mv  a6,a5
.L3:
vsetvli zero,a6,e8,m4,ta,ma
vle8.v  v20,0(a2)
vle8.v  v16,0(a3)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v24
vadd.vv v4,v16,v4
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a0)
vle8.v  v20,0(a2)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v8
vadd.vv v4,v4,v16
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a1)
add a4,a4,t5
add a0,a0,a5
add a3,a3,a5
add a1,a1,a5
add a2,a2,a5
bgtu t1,a5,.L12
csrr t0,vlenb
slli t1,t0,4
add sp,sp,t1
jr  ra
.L8:
ret
 
After this patch:
 
f3:
ble a4,zero,.L6
csrr a6,vlenb
csrr a5,vlenb
slli a6,a6,2
slli a5,a5,2
addi a6,a6,-1
slli a4,a4,3
neg t5,a5
vsetvli t1,zero,e16,m8,ta,ma
vmv.v.x v24,a6
vid.v v8
vand.vi v8,v8,-2
vadd.vi v16,v8,1
vand.vv v8,v8,v24
vand.vv v16,v16,v24
.L4:
mv t1,a4
mv a6,a4
bleu a4,a5,.L3
mv a6,a5
.L3:
vsetvli zero,a6,e8,m4,ta,ma
vle8.v v28,0(a2)
vle8.v v24,0(a3)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v28,v8
vadd.vv v4,v24,v4
vsetvli zero,a6,e8,m4,ta,ma
vse8.v v4,0(a0)
vle8.v v28,0(a2)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v28,v16
vadd.vv v4,v4,v24
vsetvli zero,a6,e8,m4,ta,ma
vse8.v v4,0(a1)
add a4,a4,t5
add a0,a0,a5
add a3,a3,a5
add a1,a1,a5
add a2,a2,a5
bgtu t1,a5,.L4
.L6:
ret
 
Note that this patch triggers multiple FAILs:
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: 

Re: [PATCH] Avoid compile time hog on vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when iteration count is too big.

2023-10-18 Thread Richard Biener
On Wed, 18 Oct 2023, liuhongt wrote:

> Also give up vectorization when niters_skip is negative which will be
> used for fully masked loop.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
> 
> gcc/ChangeLog:
> 
>   PR tree-optimization/111820
>   PR tree-optimization/111833
>   * tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Give
>   up vectorization for nonlinear iv vect_step_op_mul when
>   step_expr is not exact_log2 and niters is greater than
>   TYPE_PRECISION (TREE_TYPE (step_expr)). Also don't vectorize
>   for negative niters_skip which will be used by fully masked
>   loop.
>   (vect_can_advance_ivs_p): Pass whole phi_info to
>   vect_can_peel_nonlinear_iv_p.
>   * tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Optimize
>   init_expr * pow (step_expr, skipn) to init_expr
>   << (log2 (step_expr) * skipn) when step_expr is exact_log2.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/i386/pr111820-1.c: New test.
>   * gcc.target/i386/pr111820-2.c: New test.
>   * gcc.target/i386/pr103144-mul-1.c: Adjust testcase.
> ---
>  .../gcc.target/i386/pr103144-mul-1.c  |  6 ++--
>  gcc/testsuite/gcc.target/i386/pr111820-1.c| 16 ++
>  gcc/testsuite/gcc.target/i386/pr111820-2.c| 17 ++
>  gcc/tree-vect-loop-manip.cc   | 28 ++--
>  gcc/tree-vect-loop.cc | 32 ---
>  5 files changed, 88 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr111820-2.c
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c 
> b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> index 640c34fd959..f80d1094097 100644
> --- a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c
> @@ -23,7 +23,7 @@ foo_mul_const (int* a)
>for (int i = 0; i != N; i++)
>  {
>a[i] = b;
> -  b *= 3;
> +  b *= 4;
>  }
>  }
>  
> @@ -34,7 +34,7 @@ foo_mul_peel (int* a, int b)
>for (int i = 0; i != 39; i++)
>  {
>a[i] = b;
> -  b *= 3;
> +  b *= 4;
>  }
>  }
>  
> @@ -46,6 +46,6 @@ foo_mul_peel_const (int* a)
>for (int i = 0; i != 39; i++)
>  {
>a[i] = b;
> -  b *= 3;
> +  b *= 4;
>  }
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/pr111820-1.c 
> b/gcc/testsuite/gcc.target/i386/pr111820-1.c
> new file mode 100644
> index 000..50e960c39d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr111820-1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -fno-tree-vrp -Wno-aggressive-loop-optimizations 
> -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump "Avoid compile time hog on 
> vect_peel_nonlinear_iv_init for nonlinear induction vec_step_op_mul when 
> iteration count is too big" "vect" } } */
> +
> +int r;
> +int r_0;
> +
> +void f1 (void)
> +{
> +  int n = 0;
> +  while (-- n)
> +{
> +  r_0 += r;
> +  r  *= 3;
> +}
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr111820-2.c 
> b/gcc/testsuite/gcc.target/i386/pr111820-2.c
> new file mode 100644
> index 000..bbdb40798c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr111820-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -fno-tree-vrp -fdump-tree-vect-details 
> -Wno-aggressive-loop-optimizations" } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +
> +int r;
> +int r_0;
> +
> +void f (void)
> +{
> +  int n = 0;
> +  while (-- n)
> +{
> +  r_0 += r ;
> +  r  *= 2;
> +}
> +}
> +
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 2608c286e5d..a530088b61d 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -1783,8 +1783,10 @@ iv_phi_p (stmt_vec_info stmt_info)
>  /* Return true if vectorizer can peel for nonlinear iv.  */
>  static bool
>  vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
> -   enum vect_induction_op_type induction_type)
> +   stmt_vec_info stmt_info)
>  {
> +  enum vect_induction_op_type induction_type
> += STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
>tree niters_skip;
>/* Init_expr will be update by vect_update_ivs_after_vectorizer,
>   if niters or vf is unkown:
> @@ -1805,11 +1807,31 @@ vect_can_peel_nonlinear_iv_p (loop_vec_info 
> loop_vinfo,
>return false;
>  }
>  
> +  /* Avoid compile time hog on vect_peel_nonlinear_iv_init.  */
> +  if (induction_type == vect_step_op_mul)
> +{
> +  tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
> +  tree type = TREE_TYPE (step_expr);
> +
> +  if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
> +   && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
> + {
> + 

Re: [PATCH] tree-ssa-math-opts: Fix up match_uaddc_usubc [PR111845]

2023-10-18 Thread Richard Biener
On Wed, 18 Oct 2023, Jakub Jelinek wrote:

> Hi!
> 
> GCC ICEs on the first testcase.  Successful match_uaddc_usubc ends up with
> some dead stmts which DCE will (hopefully) remove later.
> The ICE is because one of the dead stmts refers to a freed SSA_NAME.
> The code already gsi_removes a couple of stmts in the
>   /* Remove some statements which can't be kept in the IL because they
>  use SSA_NAME whose setter is going to be removed too.  */
> section for the same reason (the reason for the freed SSA_NAMEs is that
> we don't really have a replacement for those cases - all we have after
> a match is combined overflow from the addition/subtraction of 2 operands + a
> [0, 1] carry in, but not the individual overflows from the former 2
> additions), but for the last (most significant) limb case, where we try
> to match x = op1 + op2 + carry1 + carry2; or
> x = op1 - op2 - carry1 - carry2; we just gsi_replace the final stmt, but
> left around the 2 temporary stmts as dead; if we were unlucky enough that
> those referenced the carry flag that went away, it ICEs.
> 
> So, the following patch remembers those temporary statements (rather than
> trying to rediscover them more expensively) and removes them before the
> final one is replaced.
> 
> While working on it, I've noticed we didn't support all the reassociated
> possibilities of writing the addition of 4 operands or subtracting 3
> operands from one, we supported e.g.
> x = ((op1 + op2) + op3) + op4;
> x = op1 + ((op2 + op3) + op4);
> but not
> x = (op1 + (op2 + op3)) + op4;
> x = op1 + (op2 + (op3 + op4));
> Fixed by the change to inspect also rhs[2] when rhs[1] didn't yield what
> we were searching for (if non-NULL) - rhs[0] is inspected in the first
> loop and has different handling for the MINUS_EXPR case.
> 
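> To illustrate on the C level (not part of the patch, just the shapes in
> question; in real matched code the carries are [0, 1] values produced by
> earlier limbs):
> 
>   typedef unsigned long limb;
> 
>   /* Shapes that were already matched.  */
>   limb msb_a (limb op1, limb op2, limb c1, limb c2)
>   { return ((op1 + op2) + c1) + c2; }
>   limb msb_b (limb op1, limb op2, limb c1, limb c2)
>   { return op1 + ((op2 + c1) + c2); }
> 
>   /* Newly matched shapes: the nested addition hangs off rhs[2].  */
>   limb msb_c (limb op1, limb op2, limb c1, limb c2)
>   { return (op1 + (op2 + c1)) + c2; }
>   limb msb_d (limb op1, limb op2, limb c1, limb c2)
>   { return op1 + (op2 + (c1 + c2)); }
> 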
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Richard.

> 2023-10-18  Jakub Jelinek  
> 
>   PR tree-optimization/111845
>   * tree-ssa-math-opts.cc (match_uaddc_usubc): Remember temporary
>   statements for the 4 operand addition or subtraction of 3 operands
>   from 1 operand cases and remove them when successful.  Look for
>   nested additions even from rhs[2], not just rhs[1].
> 
>   * gcc.dg/pr111845.c: New test.
>   * gcc.target/i386/pr111845.c: New test.
> 
> --- gcc/tree-ssa-math-opts.cc.jj  2023-09-18 10:37:56.180963000 +0200
> +++ gcc/tree-ssa-math-opts.cc 2023-10-17 14:46:39.430139602 +0200
> @@ -4581,6 +4581,7 @@ match_uaddc_usubc (gimple_stmt_iterator
>if (!INTEGRAL_TYPE_P (type) || !TYPE_UNSIGNED (type))
>  return false;
>  
> +  auto_vec temp_stmts;
>if (code != BIT_IOR_EXPR && code != BIT_XOR_EXPR)
>  {
>/* If overflow flag is ignored on the MSB limb, we can end up with
> @@ -4615,26 +4616,29 @@ match_uaddc_usubc (gimple_stmt_iterator
> rhs[0] = gimple_assign_rhs1 (g);
> tree &r = rhs[2] ? rhs[3] : rhs[2];
> r = r2;
> +   temp_stmts.quick_push (g);
>   }
> else
>   break;
>   }
> -  while (TREE_CODE (rhs[1]) == SSA_NAME && !rhs[3])
> - {
> -   gimple *g = SSA_NAME_DEF_STMT (rhs[1]);
> -   if (has_single_use (rhs[1])
> -   && is_gimple_assign (g)
> -   && gimple_assign_rhs_code (g) == PLUS_EXPR)
> - {
> -   rhs[1] = gimple_assign_rhs1 (g);
> -   if (rhs[2])
> - rhs[3] = gimple_assign_rhs2 (g);
> -   else
> - rhs[2] = gimple_assign_rhs2 (g);
> - }
> -   else
> - break;
> - }
> +  for (int i = 1; i <= 2; ++i)
> + while (rhs[i] && TREE_CODE (rhs[i]) == SSA_NAME && !rhs[3])
> +   {
> + gimple *g = SSA_NAME_DEF_STMT (rhs[i]);
> + if (has_single_use (rhs[i])
> + && is_gimple_assign (g)
> + && gimple_assign_rhs_code (g) == PLUS_EXPR)
> +   {
> + rhs[i] = gimple_assign_rhs1 (g);
> + if (rhs[2])
> +   rhs[3] = gimple_assign_rhs2 (g);
> + else
> +   rhs[2] = gimple_assign_rhs2 (g);
> + temp_stmts.quick_push (g);
> +   }
> + else
> +   break;
> +   }
>/* If there are just 3 addends or one minuend and two subtrahends,
>check for UADDC or USUBC being pattern recognized earlier.
>Say r = op1 + op2 + ovf1 + ovf2; where the (ovf1 + ovf2) part
> @@ -5039,7 +5043,17 @@ match_uaddc_usubc (gimple_stmt_iterator
>g = gimple_build_assign (ilhs, IMAGPART_EXPR,
>  build1 (IMAGPART_EXPR, TREE_TYPE (ilhs), nlhs));
>if (rhs[2])
> -gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +{
> +  gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +  /* Remove some further statements which can't be kept in the IL because
> +  they can use SSA_NAMEs whose setter is going to be removed too.  */
> +  while (temp_stmts.length ())
> + {
> +   g = temp_stmts.pop ();
> +  

Re: [PATCH] libstdc++: testsuite: Enhance codecvt_unicode with tests for length()

2023-10-18 Thread Dimitrij Mijoski
On Wed, 2023-10-18 at 10:52 +0100, Jonathan Wakely wrote:
> On Tue, 17 Oct 2023 at 23:51, Dimitrij Mijoski  wrote:
> > 
> > We can test codecvt::length() with the same data that we test
> > codecvt::in(). For each call of in() we add another call to length().
> > Some additional small cosmetic changes are applied.
> 
> Thanks! I'll get this applied.

I think I have an improvement to this patch, see below.

> > @@ -79,6 +78,10 @@ utf8_to_utf32_in_ok (const std::codecvt<InternT, ExternT, mbstate_t> &cvt)
> >    VERIFY (char_traits::compare (out, exp, t.out_size) == 0);
> >    if (t.out_size < array_size (out))
> >     VERIFY (out[t.out_size] == 0);
> > +
> > +  state = {};
> > +  auto len = cvt.length (state, in, in + t.in_size, t.out_size);
> > +  VERIFY (len == t.in_size);
> >  }
> > 
> >    for (auto t : offsets)

Notice that codecvt::length()'s return type is (signed) int, which should
never be negative. Still, because t.in_size is size_t, the assertion may
generate sign-compare warnings. The assertion can instead be written like
this:

VERIFY(len >= 0);
VERIFY(static_cast<size_t>(len) == t.in_size);


Re: [PATCH] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction

2023-10-18 Thread juzhe.zh...@rivai.ai
Forget about this patch.

The commit log code example is wrong; I fixed it in V2:
https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633420.html

Thanks.



juzhe.zh...@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-10-18 18:21
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction
Confirmed the dynamic LMUL algorithm works well, choosing LMUL = 4 for the PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848

But it generates horrible register spilling.

The root cause is that we didn't hoist the vmv.v.x outside the loop, which
increases the SLP loop register pressure.

So, change the CONST_VECTOR move into a vec_duplicate splitter so that we can
gain better optimizations:
 
1. better LICM.
2. More opportunities of transforming 'vv' into 'vx' in the future.
 
Before this patch:
 
f3:
ble a4,zero,.L8
csrr t0,vlenb
slli t1,t0,4
csrr a6,vlenb
sub sp,sp,t1
csrr a5,vlenb
slli a6,a6,3
slli a5,a5,2
add a6,a6,sp
vsetvli a7,zero,e16,m8,ta,ma
slli a4,a4,3
vid.v   v8
addi t6,a5,-1
vand.vi v8,v8,-2
neg t5,a5
vs8r.v  v8,0(sp)
vadd.vi v8,v8,1
vs8r.v  v8,0(a6)
j   .L4
.L12:
vsetvli a7,zero,e16,m8,ta,ma
.L4:
csrr t0,vlenb
slli t0,t0,3
vl8re16.v   v16,0(sp)
add t0,t0,sp
vmv.v.x v8,t6
mv  t1,a4
vand.vv v24,v16,v8
mv  a6,a4
vl8re16.v   v16,0(t0)
vand.vv v8,v16,v8
bleu a4,a5,.L3
mv  a6,a5
.L3:
vsetvli zero,a6,e8,m4,ta,ma
vle8.v  v20,0(a2)
vle8.v  v16,0(a3)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v24
vadd.vv v4,v16,v4
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a0)
vle8.v  v20,0(a2)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v8
vadd.vv v4,v4,v16
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a1)
add a4,a4,t5
add a0,a0,a5
add a3,a3,a5
add a1,a1,a5
add a2,a2,a5
bgtu t1,a5,.L12
csrr t0,vlenb
slli t1,t0,4
add sp,sp,t1
jr  ra
.L8:
ret
 
After this patch:
 
bar:
ble a3,zero,.L5
csrr a5,vlenb
csrr t1,vlenb
srli a5,a5,1
srli a7,t1,1
addi a5,a5,-1
vsetvli a4,zero,e32,m2,ta,ma
slli a3,a3,1
vmv.v.x v2,a5
vid.v v18
vmv.v.x v6,a1
vand.vi v10,v18,-2
vand.vi v0,v18,1
vadd.vi v16,v10,1
vmseq.vi v0,v0,1
vand.vv v10,v10,v2
vand.vv v16,v16,v2
slli t1,t1,1
vsetvli zero,a4,e32,m2,ta,ma
neg t3,a7
viota.m v4,v0
vsetvli a4,zero,e32,m2,ta,mu
vmv.v.x v8,a2
vrgather.vv v14,v6,v4
vrgather.vv v12,v8,v4
vmv.v.i v2,0
vrgather.vv v14,v8,v4,v0.t
vrgather.vv v12,v6,v4,v0.t
.L4:
mv a2,a3
mv a5,a3
bleu a3,a7,.L3
mv a5,a7
.L3:
vsetvli zero,a5,e32,m2,ta,ma
vle32.v v6,0(a0)
vsetvli a6,zero,e32,m2,ta,ma
add a3,a3,t3
vrgather.vv v4,v6,v10
vrgather.vv v8,v6,v16
vsub.vv v4,v4,v12
add a0,a0,t1
vsetvli zero,a5,e32,m2,tu,ma
vadd.vv v2,v2,v4
vmacc.vv v2,v14,v8
bgtu a2,a7,.L4
li a5,-1
vsetvli a6,zero,e32,m2,ta,ma
li a4,0
vmv.v.i v4,0
vmul.vx v0,v18,a5
vadd.vi v0,v0,-1
vand.vi v0,v0,1
vmseq.vv v0,v0,v4
vand.vi v18,v18,1
vmerge.vvm v6,v4,v2,v0
vmseq.vv v18,v18,v4
vmv.s.x v1,a4
vmv1r.v v0,v18
vredsum.vs v6,v6,v1
vmerge.vvm v4,v4,v2,v0
vmv.x.s a0,v6
vredsum.vs v4,v4,v1
vmv.x.s a5,v4
addw a0,a0,a5
ret
.L5:
li a0,0
ret
 
Note that this patch triggers multiple FAILs:
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c 
execution test
 
They all fail because of bugs in the VSETVL pass:

10dd4:   0c707057        vsetvli zero,zero,e8,mf2,ta,ma
10dd8:   5e06b8d7        vmv.v.i v17,13
10ddc:   9ed030d7        vmv1r.v v1,v13

[PATCH V2] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction

2023-10-18 Thread Juzhe-Zhong
Confirmed the dynamic LMUL algorithm works well, choosing LMUL = 4 for the PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848

But it generates horrible register spilling.

The root cause is that we didn't hoist the vmv.v.x outside the loop, which
increases the SLP loop register pressure.

So, change the CONST_VECTOR move into a vec_duplicate splitter so that we can
gain better optimizations:

1. better LICM.
2. More opportunities of transforming 'vv' into 'vx' in the future.
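
The splitter shape is roughly as follows — a simplified sketch with
predicates and attributes elided, not the exact patch text; the point is that
the broadcast stays a (vec_duplicate ...) rtx until after LICM has had a
chance to hoist it, and only then is lowered to vmv.v.x:

(define_insn_and_split "*vec_duplicate<mode>"
  [(set (match_operand:V_VLS 0 "register_operand")
        (vec_duplicate:V_VLS
          (match_operand:<VEL> 1 "register_operand")))]
  "TARGET_VECTOR && can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(const_int 0)]
{
  riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
                                 riscv_vector::UNARY_OP, operands);
  DONE;
})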

Before this patch:

f3:
ble a4,zero,.L8
csrr t0,vlenb
slli t1,t0,4
csrr a6,vlenb
sub sp,sp,t1
csrr a5,vlenb
slli a6,a6,3
slli a5,a5,2
add a6,a6,sp
vsetvli a7,zero,e16,m8,ta,ma
slli a4,a4,3
vid.v   v8
addi t6,a5,-1
vand.vi v8,v8,-2
neg t5,a5
vs8r.v  v8,0(sp)
vadd.vi v8,v8,1
vs8r.v  v8,0(a6)
j   .L4
.L12:
vsetvli a7,zero,e16,m8,ta,ma
.L4:
csrr t0,vlenb
slli t0,t0,3
vl8re16.v   v16,0(sp)
add t0,t0,sp
vmv.v.x v8,t6
mv  t1,a4
vand.vv v24,v16,v8
mv  a6,a4
vl8re16.v   v16,0(t0)
vand.vv v8,v16,v8
bleu a4,a5,.L3
mv  a6,a5
.L3:
vsetvli zero,a6,e8,m4,ta,ma
vle8.v  v20,0(a2)
vle8.v  v16,0(a3)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v24
vadd.vv v4,v16,v4
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a0)
vle8.v  v20,0(a2)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v8
vadd.vv v4,v4,v16
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a1)
add a4,a4,t5
add a0,a0,a5
add a3,a3,a5
add a1,a1,a5
add a2,a2,a5
bgtu t1,a5,.L12
csrr t0,vlenb
slli t1,t0,4
add sp,sp,t1
jr  ra
.L8:
ret

After this patch:

f3:
ble a4,zero,.L6
csrr a6,vlenb
csrr a5,vlenb
slli a6,a6,2
slli a5,a5,2
addi a6,a6,-1
slli a4,a4,3
neg t5,a5
vsetvli t1,zero,e16,m8,ta,ma
vmv.v.x v24,a6
vid.v   v8
vand.vi v8,v8,-2
vadd.vi v16,v8,1
vand.vv v8,v8,v24
vand.vv v16,v16,v24
.L4:
mv  t1,a4
mv  a6,a4
bleu a4,a5,.L3
mv  a6,a5
.L3:
vsetvli zero,a6,e8,m4,ta,ma
vle8.v  v28,0(a2)
vle8.v  v24,0(a3)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v28,v8
vadd.vv v4,v24,v4
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a0)
vle8.v  v28,0(a2)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v28,v16
vadd.vv v4,v4,v24
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a1)
add a4,a4,t5
add a0,a0,a5
add a3,a3,a5
add a1,a1,a5
add a2,a2,a5
bgtu t1,a5,.L4
.L6:
ret

Note that this patch triggers multiple FAILs:
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c 
execution test

They all fail because of bugs in the VSETVL pass:

10dd4:   0c707057        vsetvli zero,zero,e8,mf2,ta,ma
10dd8:   5e06b8d7        vmv.v.i v17,13
10ddc:   9ed030d7        vmv1r.v v1,v13
10de0:   b21040d7        vncvt.x.x.w v1,v1   <== raises an illegal instruction, since we don't have SEW = 8 -> SEW = 4 narrowing
10de4:   5e0785d7        vmv.v.v v11,v15

Confirmed that the recent VSETVL refactor patch
https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633231.html fixes all of
them.

So this patch should be committed after the VSETVL refactor patch.

PR target/111848


Re: [PATCH] Support g++ 4.8 as a host compiler.

2023-10-18 Thread Richard Sandiford
Jakub Jelinek  writes:
> On Sun, Oct 15, 2023 at 12:43:10PM +0100, Richard Sandiford wrote:
>> It seemed like there was considerable support for bumping the minimum
>> to beyond 4.8.  I think we should wait until a decision has been made
>> before adding more 4.8 workarounds.
>
> I think adding a workaround until that decision is made and perhaps
> removing it afterwards will make life easier for people still using gcc 4.8.
>
>> Having a conditional explicit constructor is dangerous because it changes
>> semantics.  E.g. consider:
>> 
>>   #include <new>
>> 
>>   union u { int x; };
>>   void f(u *ptr) { new(ptr) u; }
>>   void g(u *ptr) { new(ptr) u(); }
>> 
>> g(ptr) zeros ptr->x whereas f(ptr) doesn't.  If we add "u() {}" then g()
>> does not zero ptr->x.
>> 
>> So if we did add the workaround, it would need to be unconditional,
>> like you say.
>
> What about using more directed workaround then?
>
> Like (just stage1 build tested, perhaps with comment why we do that)
> below?  Seems at least in stage1 it is the only problematic spot.
>
> --- a/gcc/cse.cc
> +++ b/gcc/cse.cc
> @@ -4951,8 +4951,14 @@ cse_insn (rtx_insn *insn)
> >   && is_a <scalar_int_mode> (mode, &int_mode)
>   && (extend_op = load_extend_op (int_mode)) != UNKNOWN)
> {
> +#if GCC_VERSION >= 5000
>   struct rtx_def memory_extend_buf;
> >   rtx memory_extend_rtx = &memory_extend_buf;
> +#else
> + alignas (alignof (rtx_def)) unsigned char
> +   memory_extended_buf[sizeof (rtx_def)];

Looks like the simpler "alignas (rtx_def)" should work.

LGTM otherwise FWIW.

Richard

> + rtx memory_extend_rtx = (rtx) &memory_extended_buf[0];
> +#endif
>  
>   /* Set what we are trying to extend and the operation it might
>  have been extended with.  */
>
>
>   Jakub


[PATCH] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction

2023-10-18 Thread Juzhe-Zhong
Confirmed the dynamic LMUL algorithm works well, choosing LMUL = 4 for the PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848

But it generates horrible register spilling.

The root cause is that we didn't hoist the vmv.v.x outside the loop, which
increases the SLP loop register pressure.

So, change the CONST_VECTOR move into a vec_duplicate splitter so that we can
gain better optimizations:

1. better LICM.
2. More opportunities of transforming 'vv' into 'vx' in the future.

Before this patch:

f3:
ble a4,zero,.L8
csrr t0,vlenb
slli t1,t0,4
csrr a6,vlenb
sub sp,sp,t1
csrr a5,vlenb
slli a6,a6,3
slli a5,a5,2
add a6,a6,sp
vsetvli a7,zero,e16,m8,ta,ma
slli a4,a4,3
vid.v   v8
addi t6,a5,-1
vand.vi v8,v8,-2
neg t5,a5
vs8r.v  v8,0(sp)
vadd.vi v8,v8,1
vs8r.v  v8,0(a6)
j   .L4
.L12:
vsetvli a7,zero,e16,m8,ta,ma
.L4:
csrr t0,vlenb
slli t0,t0,3
vl8re16.v   v16,0(sp)
add t0,t0,sp
vmv.v.x v8,t6
mv  t1,a4
vand.vv v24,v16,v8
mv  a6,a4
vl8re16.v   v16,0(t0)
vand.vv v8,v16,v8
bleu a4,a5,.L3
mv  a6,a5
.L3:
vsetvli zero,a6,e8,m4,ta,ma
vle8.v  v20,0(a2)
vle8.v  v16,0(a3)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v24
vadd.vv v4,v16,v4
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a0)
vle8.v  v20,0(a2)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v20,v8
vadd.vv v4,v4,v16
vsetvli zero,a6,e8,m4,ta,ma
vse8.v  v4,0(a1)
add a4,a4,t5
add a0,a0,a5
add a3,a3,a5
add a1,a1,a5
add a2,a2,a5
bgtu t1,a5,.L12
csrr t0,vlenb
slli t1,t0,4
add sp,sp,t1
jr  ra
.L8:
ret

After this patch:

bar:
ble a3,zero,.L5
csrr a5,vlenb
csrr t1,vlenb
srli a5,a5,1
srli a7,t1,1
addi a5,a5,-1
vsetvli a4,zero,e32,m2,ta,ma
slli a3,a3,1
vmv.v.x v2,a5
vid.v   v18
vmv.v.x v6,a1
vand.vi v10,v18,-2
vand.vi v0,v18,1
vadd.vi v16,v10,1
vmseq.vi v0,v0,1
vand.vv v10,v10,v2
vand.vv v16,v16,v2
slli t1,t1,1
vsetvli zero,a4,e32,m2,ta,ma
neg t3,a7
viota.m v4,v0
vsetvli a4,zero,e32,m2,ta,mu
vmv.v.x v8,a2
vrgather.vv v14,v6,v4
vrgather.vv v12,v8,v4
vmv.v.i v2,0
vrgather.vv v14,v8,v4,v0.t
vrgather.vv v12,v6,v4,v0.t
.L4:
mv  a2,a3
mv  a5,a3
bleu a3,a7,.L3
mv  a5,a7
.L3:
vsetvli zero,a5,e32,m2,ta,ma
vle32.v v6,0(a0)
vsetvli a6,zero,e32,m2,ta,ma
add a3,a3,t3
vrgather.vv v4,v6,v10
vrgather.vv v8,v6,v16
vsub.vv v4,v4,v12
add a0,a0,t1
vsetvli zero,a5,e32,m2,tu,ma
vadd.vv v2,v2,v4
vmacc.vv v2,v14,v8
bgtu a2,a7,.L4
li  a5,-1
vsetvli a6,zero,e32,m2,ta,ma
li  a4,0
vmv.v.i v4,0
vmul.vx v0,v18,a5
vadd.vi v0,v0,-1
vand.vi v0,v0,1
vmseq.vv v0,v0,v4
vand.vi v18,v18,1
vmerge.vvm  v6,v4,v2,v0
vmseq.vv v18,v18,v4
vmv.s.x v1,a4
vmv1r.v v0,v18
vredsum.vs  v6,v6,v1
vmerge.vvm  v4,v4,v2,v0
vmv.x.s a0,v6
vredsum.vs  v4,v4,v1
vmv.x.s a5,v4
addw a0,a0,a5
ret
.L5:
li  a0,0
ret

Note that this patch triggers multiple FAILs:
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c 
execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c 
execution test
FAIL: 

Re: [PATCH] Support g++ 4.8 as a host compiler.

2023-10-18 Thread Jakub Jelinek
On Sun, Oct 15, 2023 at 12:43:10PM +0100, Richard Sandiford wrote:
> It seemed like there was considerable support for bumping the minimum
> to beyond 4.8.  I think we should wait until a decision has been made
> before adding more 4.8 workarounds.

I think adding a workaround until that decision is made and perhaps
removing it afterwards will make life easier for people still using gcc 4.8.

> Having a conditional explicit constructor is dangerous because it changes
> semantics.  E.g. consider:
> 
>   #include <new>
> 
>   union u { int x; };
>   void f(u *ptr) { new(ptr) u; }
>   void g(u *ptr) { new(ptr) u(); }
> 
> g(ptr) zeros ptr->x whereas f(ptr) doesn't.  If we add "u() {}" then g()
> does not zero ptr->x.
> 
> So if we did add the workaround, it would need to be unconditional,
> like you say.

What about using more directed workaround then?

Like (just stage1 build tested, perhaps with comment why we do that)
below?  Seems at least in stage1 it is the only problematic spot.

--- a/gcc/cse.cc
+++ b/gcc/cse.cc
@@ -4951,8 +4951,14 @@ cse_insn (rtx_insn *insn)
  && is_a <scalar_int_mode> (mode, &int_mode)
  && (extend_op = load_extend_op (int_mode)) != UNKNOWN)
{
+#if GCC_VERSION >= 5000
  struct rtx_def memory_extend_buf;
  rtx memory_extend_rtx = &memory_extend_buf;
+#else
+ alignas (alignof (rtx_def)) unsigned char
+   memory_extended_buf[sizeof (rtx_def)];
+ rtx memory_extend_rtx = (rtx) &memory_extended_buf[0];
+#endif
 
  /* Set what we are trying to extend and the operation it might
 have been extended with.  */


Jakub


