Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-20 Thread Andre Vieira (lists)
Squashed the definition of, and changes to, predicated_doloop_end_internal
and dlstp*_insn into this patch, to make sure the first patch builds
independently.


On 18/12/2023 11:53, Andre Vieira wrote:


Reworked Stam's patch after comments in:
https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640362.html

The original gcc ChangeLog remains unchanged, but I did split up some tests so
here is the testsuite ChangeLog.


gcc/testsuite/ChangeLog:

* gcc.target/arm/lob.h: Update framework.
* gcc.target/arm/lob1.c: Likewise.
* gcc.target/arm/lob6.c: Likewise.
* gcc.target/arm/mve/dlstp-compile-asm.c: New test.
* gcc.target/arm/mve/dlstp-int16x8.c: New test.
* gcc.target/arm/mve/dlstp-int16x8-run.c: New test.
* gcc.target/arm/mve/dlstp-int32x4.c: New test.
* gcc.target/arm/mve/dlstp-int32x4-run.c: New test.
* gcc.target/arm/mve/dlstp-int64x2.c: New test.
* gcc.target/arm/mve/dlstp-int64x2-run.c: New test.
* gcc.target/arm/mve/dlstp-int8x16.c: New test.
* gcc.target/arm/mve/dlstp-int8x16-run.c: New test.
* gcc.target/arm/mve/dlstp-invalid-asm.c: New test.
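[Editorial note: for context, here is a hedged sketch (not one of the new
tests listed above) of the kind of intrinsics loop the dlstp/letp transform
targets; the vctp-generated predicate and the predicated loads/stores are
what allow the whole loop to collapse into a dlstp.32/letp pair.]

```
#include <arm_mve.h>

/* Add n int32 elements; the final, partial iteration is governed by
   the vctp32q-generated tail predicate.  */
void vadd (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);		/* lanes 0..n-1 active */
      int32x4_t va = vldrwq_z_s32 (a, p);	/* predicated load */
      int32x4_t vb = vldrwq_z_s32 (b, p);
      vstrwq_p_s32 (c, vaddq_x_s32 (va, vb, p), p);  /* predicated store */
      a += 4; b += 4; c += 4;
      n -= 4;
    }
}
```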
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2f5ca79ed8ddd647b212782a0454ee4fefc07257..4f164c547406c43219900c111401540c7ef9d7d1 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern rtx arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 0c0cb14a8a4f043357b8acd7042a9f9386af1eb1..1ee72bcb7ec4bea5feea8453ceef7702b0088a73 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34483,19 +34489,1147 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
+arm_target_bb_ok_for_lob (basic_block bb)
 {
-  basic_block bb = BLOCK_FOR_INSN (insn);
   /* Make sure the basic block of the target insn is a simple latch
      having as single predecessor and successor the body of the loop
      itself.  Only simple loops with a single basic block as body are
      supported for 'low over head loop' making sure that LE target is
      above LE itself in the generated code.  */
-
   return single_succ_p (bb)
-	 && single_pred_p (bb)
-	 && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-	 && contains_no_active_insn_p (bb);
+	 && single_pred_p (bb)
+	 && single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+/* Utility fuction: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.  */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+      && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+      && (XINT (SET_SRC (insn_set), 1) == VCTP
+	  || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+    {
+      machine_mode mode = GET_MODE (SET_SRC (insn_set));
+      return (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+	     ? GET_MODE_NUNITS (mode) : 0;
+    }
+  return 0;
+}
+
+/* Check if INSN requires the use of the VPR reg, if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == 0, check all operands, including the OUT operands,
+ and return the first occurrence of the VPR reg.
+   * For TYPE == 1, only check the input operands.
+   * For TYPE == 2, only check the output operands.
+   (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+    return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      requires_vpr = true;
+      if (type == 1 && 

[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-18 Thread Andre Vieira

Reworked Stam's patch after comments in:
https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640362.html

The original gcc ChangeLog remains unchanged, but I did split up some tests so
here is the testsuite ChangeLog.


gcc/testsuite/ChangeLog:

* gcc.target/arm/lob.h: Update framework.
* gcc.target/arm/lob1.c: Likewise.
* gcc.target/arm/lob6.c: Likewise.
* gcc.target/arm/mve/dlstp-compile-asm.c: New test.
* gcc.target/arm/mve/dlstp-int16x8.c: New test.
* gcc.target/arm/mve/dlstp-int16x8-run.c: New test.
* gcc.target/arm/mve/dlstp-int32x4.c: New test.
* gcc.target/arm/mve/dlstp-int32x4-run.c: New test.
* gcc.target/arm/mve/dlstp-int64x2.c: New test.
* gcc.target/arm/mve/dlstp-int64x2-run.c: New test.
* gcc.target/arm/mve/dlstp-int8x16.c: New test.
* gcc.target/arm/mve/dlstp-int8x16-run.c: New test.
* gcc.target/arm/mve/dlstp-invalid-asm.c: New test.

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2f5ca79ed8d..4f164c54740 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern rtx arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 0c0cb14a8a4..1ee72bcb7ec 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34483,19 +34489,1147 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
+arm_target_bb_ok_for_lob (basic_block bb)
 {
-  basic_block bb = BLOCK_FOR_INSN (insn);
   /* Make sure the basic block of the target insn is a simple latch
      having as single predecessor and successor the body of the loop
      itself.  Only simple loops with a single basic block as body are
      supported for 'low over head loop' making sure that LE target is
      above LE itself in the generated code.  */
-
   return single_succ_p (bb)
-	 && single_pred_p (bb)
-	 && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-	 && contains_no_active_insn_p (bb);
+	 && single_pred_p (bb)
+	 && single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+/* Utility fuction: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.  */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+      && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+      && (XINT (SET_SRC (insn_set), 1) == VCTP
+	  || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+    {
+      machine_mode mode = GET_MODE (SET_SRC (insn_set));
+      return (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+	     ? GET_MODE_NUNITS (mode) : 0;
+    }
+  return 0;
+}
+
+/* Check if INSN requires the use of the VPR reg, if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == 0, check all operands, including the OUT operands,
+ and return the first occurrence of the VPR reg.
+   * For TYPE == 1, only check the input operands.
+   * For TYPE == 2, only check the output operands.
+   (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+    return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      requires_vpr = true;
+      if (type == 1 && recog_data.operand_type[op] == OP_OUT)
+	continue;
+      else if (type == 2 && recog_data.operand_type[op] == OP_IN)
+	continue;
+
+      /* Iterate through alternatives of operand "op" in recog_op_alt and
+	 identify if the operand is required to be the VPR.  */
+      for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+	{
+	  const 

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-12 Thread Richard Earnshaw




On 30/11/2023 12:55, Stamatis Markianos-Wright wrote:

Hi Andre,

Thanks for the comments, see latest revision attached.

On 27/11/2023 12:47, Andre Vieira (lists) wrote:

Hi Stam,

Just some comments.

+/* Recursively scan through the DF chain backwards within the basic block and
+   determine if any of the USEs of the original insn (or the USEs of the insns
+   where thy were DEF-ed, etc., recursively) were affected by implicit VPT

s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :)

remove recursively for the same reasons.

+  if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+    return NULL;
+  /* Look at the steps and swap around the rtx's if needed.  Error out if
+     one of them cannot be identified as constant.  */
+  if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+    return NULL;

Move the comment above the first if, since the erroring-out it talks about happens there.

Done


+  emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
 space after 'insn_note)'

@@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 -  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
   || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !CONST_INT_P (XEXP (inc_src, 1)))

Do we ever check that inc_src is negative? We used to check if it was
-1, now we only check that it's a constant, but not a negative one, so I
suspect this needs a:

|| INTVAL (XEXP (inc_src, 1)) >= 0

Good point. Done
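[Editorial note: for reference, the resulting check in doloop_condition_get
with the suggested guard folded in -- a sketch of the agreed change, not a
quote of the committed code.]

```
  if (GET_CODE (inc_src) != PLUS
      || XEXP (inc_src, 0) != reg
      || !CONST_INT_P (XEXP (inc_src, 1))
      || INTVAL (XEXP (inc_src, 1)) >= 0)
    return 0;
```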


@@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc,

 case GE:
   /* Currently only GE tests against zero are supported.  */
   gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+  /* FALLTHRU */
+    case GTU:
   noloop = constm1_rtx;

I spent a very long time staring at this trying to understand why
noloop = constm1_rtx for GTU, where I thought it should've been (count
& (n-1)). For the current use of doloop it doesn't matter because ARM
is the only target using it and you set desc->noloop_assumptions to
null_rtx in 'arm_attempt_dlstp_transform', so noloop is never used.
However, if a different target accepts this GTU pattern then this
target agnostic code will do the wrong thing.  I suggest we either:
 - set noloop to what we think might be the correct value, which if
you ask me should be 'count & (XEXP (condition, 1))',
 - or add a gcc_assert (GET_CODE (condition) != GTU); under the if
(desc->noloop_assumptions) part and document why.  I have a slight
preference for the assert, given that otherwise we are adding code that
we can't test.


Yea, that's true tbh. I've done the latter, but also separated out the
"case GTU:" and added a comment, so that it's clearer that the noloop
things aren't used in the only implemented GTU case (Arm).


Thank you :)



LGTM otherwise (but I don't have the power to approve this ;)).

Kind regards,
Andre

From: Stamatis Markianos-Wright 
Sent: Thursday, November 16, 2023 11:36 AM
To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; 
Richard Sandiford; Kyrylo Tkachov
Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated 
Low Overhead Loops


Pinging back to the top of reviewers' inboxes due to worry about Stage 1
End in a few days :)


See the last email for the latest version of the 2/2 patch. The 1/2
patch is A-Ok from Kyrill's earlier target-backend review.


On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:
One of the main reasons for reading the arm bits was to try to 
answer

the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:

This kind of loops fail because of they are "desc->infinite", then no
loop-doloop conversion is attempted at all (even for standard
dls/le loops)

Thanks to that check I haven't been able to trigger anything like the
behaviour you describe, do you think the doloop_valid_p checks are
robust enough?

The loops I was thinking of are provably no

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-09 Thread Richard Sandiford
Sorry for the slow review.

Stamatis Markianos-Wright  writes:
> [...]
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 44a04b86cb5806fcf50917826512fd203d42106c..c083f965fa9a40781bc86beb6e63654afd14eac4 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -6922,23 +6922,24 @@
>  ;; Originally expanded by 'predicated_doloop_end'.
>  ;; In the rare situation where the branch is too far, we do also need to
>  ;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration.
> -(define_insn "*predicated_doloop_end_internal"
> +(define_insn "predicated_doloop_end_internal"
>[(set (pc)
>   (if_then_else
> -	    (ge (plus:SI (reg:SI LR_REGNUM)
> -			 (match_operand:SI 0 "const_int_operand" ""))
> -		(const_int 0))
> -	  (label_ref (match_operand 1 "" ""))
> +	    (gtu (unspec:SI [(plus:SI (match_operand:SI 0 "s_register_operand" "=r")
> +				      (const_int <letp_num_lanes_neg>))]
> +			    LETP)
> +		 (const_int <letp_num_lanes_minus_1>))

Is there any need for the unspec?  I couldn't see why this wasn't simply:

  (gtu (match_operand:SI 0 "s_register_operand" "=r")
   (const_int ))

But I agree that using gtu rather than ge is nicer if it's what the
instruction does.

> diff --git a/gcc/df-core.cc b/gcc/df-core.cc
> index d4812b04a7cb97ea1606082e26e910472da5bcc1..4fcc14bf790d43e792b3c926fe1f80073d908c17 100644
> --- a/gcc/df-core.cc
> +++ b/gcc/df-core.cc
> @@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned int regno)
>return NULL;
>  }
>  
> +/* Return the one and only def of REGNO within BB.  If there is no def or
> +   there are multiple defs, return NULL.  */
> +
> +df_ref
> +df_bb_regno_only_def_find (basic_block bb, unsigned int regno)
> +{
> +  df_ref temp = df_bb_regno_first_def_find (bb, regno);
> +  if (!temp)
> +    return NULL;
> +  else if (temp == df_bb_regno_last_def_find (bb, regno))
> +    return temp;
> +  else
> +    return NULL;
> +}
> +
>  /* Finds the reference corresponding to the definition of REG in INSN.
> DF is the dataflow object.  */
>  
> diff --git a/gcc/df.h b/gcc/df.h
> index 402657a7076f1bcad24e9c50682e033e57f432f9..98623637f9c839c799222e99df2a7173a770b2ac 100644
> --- a/gcc/df.h
> +++ b/gcc/df.h
> @@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void);
>  #endif
>  extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int);
>  extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int);
> +extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int);
>  extern df_ref df_find_def (rtx_insn *, rtx);
>  extern bool df_reg_defined (rtx_insn *, rtx);
>  extern df_ref df_find_use (rtx_insn *, rtx);
> diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
> index 4feb0a25ab9331b7124df900f73c9fc6fb3eb10b..d919207505c472c8a54a2c9c982a09061584177b 100644
> --- a/gcc/loop-doloop.cc
> +++ b/gcc/loop-doloop.cc
> @@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat)
>   forms:
>  
>   1)  (parallel [(set (pc) (if_then_else (condition)
> - (label_ref (label))
> - (pc)))
> -  (set (reg) (plus (reg) (const_int -1)))
> -  (additional clobbers and uses)])
> + (label_ref (label))
> + (pc)))
> +  (set (reg) (plus (reg) (const_int -1)))
> +  (additional clobbers and uses)])
>  
>   The branch must be the first entry of the parallel (also required
>   by jump.cc), and the second entry of the parallel must be a set of
> @@ -96,19 +96,34 @@ doloop_condition_get (rtx_insn *doloop_pat)
>   the loop counter in an if_then_else too.
>  
>   2)  (set (reg) (plus (reg) (const_int -1))
> - (set (pc) (if_then_else (reg != 0)
> -  (label_ref (label))
> -  (pc))).  
> +  (set (pc) (if_then_else (reg != 0)
> +  (label_ref (label))
> +  (pc))).
>  
>   Some targets (ARM) do the comparison before the branch, as in the
>   following form:
>  
> - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
> -   (set (reg) (plus (reg) (const_int -1)))])
> -(set (pc) (if_then_else (cc == NE)
> -(label_ref (label))
> -(pc))) */
> -
> + 3) (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0))
> +(set (reg) (plus (reg) (const_int -1)))])
> + (set (pc) (if_then_else (cc == NE)
> + (label_ref (label))
> + (pc)))
> +
> +  The ARM target also supports a special case of a counter that 
> decrements
> +  by `n` and terminating in a GTU condition.  In that case, the compare 
> and
> +  

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-07 Thread Andre Vieira (lists)
Thanks for addressing my comments. I have reviewed this and the other 
patch before and they LGTM. I however do not have approval rights so you 
will need the OK from a maintainer.


Thanks for doing this :)

Andre

On 30/11/2023 12:55, Stamatis Markianos-Wright wrote:

Hi Andre,

Thanks for the comments, see latest revision attached.

On 27/11/2023 12:47, Andre Vieira (lists) wrote:

Hi Stam,

Just some comments.

+/* Recursively scan through the DF chain backwards within the basic block and
+   determine if any of the USEs of the original insn (or the USEs of the insns
+   where thy were DEF-ed, etc., recursively) were affected by implicit VPT

s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :)

remove recursively for the same reasons.

+  if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+    return NULL;
+  /* Look at the steps and swap around the rtx's if needed.  Error out if
+     one of them cannot be identified as constant.  */
+  if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+    return NULL;

Move the comment above the first if, since the erroring-out it talks about happens there.

Done


+  emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
 space after 'insn_note)'

@@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 -  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
   || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !CONST_INT_P (XEXP (inc_src, 1)))

Do we ever check that inc_src is negative? We used to check if it was
-1, now we only check that it's a constant, but not a negative one, so I
suspect this needs a:

|| INTVAL (XEXP (inc_src, 1)) >= 0

Good point. Done


@@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc,

 case GE:
   /* Currently only GE tests against zero are supported.  */
   gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+  /* FALLTHRU */
+    case GTU:
   noloop = constm1_rtx;

I spent a very long time staring at this trying to understand why
noloop = constm1_rtx for GTU, where I thought it should've been (count
& (n-1)). For the current use of doloop it doesn't matter because ARM
is the only target using it and you set desc->noloop_assumptions to
null_rtx in 'arm_attempt_dlstp_transform', so noloop is never used.
However, if a different target accepts this GTU pattern then this
target agnostic code will do the wrong thing.  I suggest we either:
 - set noloop to what we think might be the correct value, which if
you ask me should be 'count & (XEXP (condition, 1))',
 - or add a gcc_assert (GET_CODE (condition) != GTU); under the if
(desc->noloop_assumptions) part and document why.  I have a slight
preference for the assert, given that otherwise we are adding code that
we can't test.


Yea, that's true tbh. I've done the latter, but also separated out the
"case GTU:" and added a comment, so that it's clearer that the noloop
things aren't used in the only implemented GTU case (Arm).


Thank you :)



LGTM otherwise (but I don't have the power to approve this ;)).

Kind regards,
Andre

From: Stamatis Markianos-Wright 
Sent: Thursday, November 16, 2023 11:36 AM
To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; 
Richard Sandiford; Kyrylo Tkachov
Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated 
Low Overhead Loops


Pinging back to the top of reviewers' inboxes due to worry about Stage 1
End in a few days :)


See the last email for the latest version of the 2/2 patch. The 1/2
patch is A-Ok from Kyrill's earlier target-backend review.


On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:
One of the main reasons for reading the arm bits was to try to 
answer

the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:

This kind of loops fail because of they are "desc->infinite", then no
loop-doloop conversion is attempted at all (even for standa

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-11-27 Thread Andre Vieira (lists)

Hi Stam,

Just some comments.

+/* Recursively scan through the DF chain backwards within the basic block and
+   determine if any of the USEs of the original insn (or the USEs of the insns
+   where thy were DEF-ed, etc., recursively) were affected by implicit VPT

s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :)

remove recursively for the same reasons.

+  if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+    return NULL;
+  /* Look at the steps and swap around the rtx's if needed.  Error out if
+     one of them cannot be identified as constant.  */
+  if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+    return NULL;

Move the comment above the first if, since the erroring-out it talks about happens there.


+  emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
 space after 'insn_note)'

@@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 -  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
   || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !CONST_INT_P (XEXP (inc_src, 1)))

Do we ever check that inc_src is negative? We used to check if it was
-1, now we only check that it's a constant, but not a negative one, so I
suspect this needs a:

|| INTVAL (XEXP (inc_src, 1)) >= 0

@@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 case GE:
   /* Currently only GE tests against zero are supported.  */
   gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+  /* FALLTHRU */
+case GTU:
   noloop = constm1_rtx;

I spent a very long time staring at this trying to understand why
noloop = constm1_rtx for GTU, where I thought it should've been (count
& (n-1)). For the current use of doloop it doesn't matter because ARM
is the only target using it and you set desc->noloop_assumptions to
null_rtx in 'arm_attempt_dlstp_transform', so noloop is never used.
However, if a different target accepts this GTU pattern then this
target agnostic code will do the wrong thing.  I suggest we either:
 - set noloop to what we think might be the correct value, which if
you ask me should be 'count & (XEXP (condition, 1))',
 - or add a gcc_assert (GET_CODE (condition) != GTU); under the if
(desc->noloop_assumptions) part and document why.  I have a slight
preference for the assert, given that otherwise we are adding code that
we can't test.


LGTM otherwise (but I don't have the power to approve this ;)).

Kind regards,
Andre

From: Stamatis Markianos-Wright 
Sent: Thursday, November 16, 2023 11:36 AM
To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; Richard 
Sandiford; Kyrylo Tkachov
Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low 
Overhead Loops


Pinging back to the top of reviewers' inboxes due to worry about Stage 1
End in a few days :)


See the last email for the latest version of the 2/2 patch. The 1/2
patch is A-Ok from Kyrill's earlier target-backend review.


On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:

One of the main reasons for reading the arm bits was to try to answer
the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:

This kind of loops fail because of they are "desc->infinite", then no
loop-doloop conversion is attempted at all (even for standard
dls/le loops)

Thanks to that check I haven't been able to trigger anything like the
behaviour you describe, do you think the doloop_valid_p checks are
robust enough?

The loops I was thinking of are provably not infinite though. E.g.:

   for (unsigned int i = 0; i < UINT_MAX - 100; ++i)
 ...

is known to terminate.  And doloop conversion is safe with the normal
count-down-by-1 approach, so I don't think current code would need
to reject it.  I.e. a conversion to:

   unsigned int i = UINT_MAX - 101;
   do
 ...
   while (--i != ~0U);

would be safe, but a conversion to:

   int i = UINT_MAX - 101;
   do
 ...

[PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-11-16 Thread Stamatis Markianos-Wright
Pinging back to the top of reviewers' inboxes due to worry about Stage 1 
End in a few days :)



See the last email for the latest version of the 2/2 patch. The 1/2 
patch is A-Ok from Kyrill's earlier target-backend review.



On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:

One of the main reasons for reading the arm bits was to try to answer
the question: if we switch to a downcounting loop with a GE 
condition,

how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?
So AFAICT this is actually handled in the generic code in 
`doloop_valid_p`:


This kind of loops fail because of they are "desc->infinite", then no
loop-doloop conversion is attempted at all (even for standard 
dls/le loops)


Thanks to that check I haven't been able to trigger anything like the
behaviour you describe, do you think the doloop_valid_p checks are
robust enough?

The loops I was thinking of are provably not infinite though. E.g.:

   for (unsigned int i = 0; i < UINT_MAX - 100; ++i)
 ...

is known to terminate.  And doloop conversion is safe with the normal
count-down-by-1 approach, so I don't think current code would need
to reject it.  I.e. a conversion to:

   unsigned int i = UINT_MAX - 101;
   do
 ...
   while (--i != ~0U);

would be safe, but a conversion to:

   int i = UINT_MAX - 101;
   do
 ...
   while ((i -= step, i > 0));

wouldn't, because the loop body would only be executed once.

I'm only going off the name "infinite" though :)  It's possible that
it has more connotations than that.

Thanks,
Richard
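[Editorial note: a small standalone C program (hypothetical, not from the
thread) demonstrating the hazard Richard describes: once the large unsigned
start value is reinterpreted as a negative signed value, a signed
greater-than-zero termination test fails immediately and the body runs
exactly once.]

```
#include <stdio.h>
#include <limits.h>

int main (void)
{
  unsigned int start = UINT_MAX - 101;	/* well above INT_MAX */
  int i = start;			/* typically becomes negative */
  unsigned int iters = 0;
  do
    iters++;
  while ((i -= 1, i > 0));		/* signed test: false at once */
  printf ("%u\n", iters);		/* prints 1, not 4294967194 */
  return 0;
}
```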


Ack, yep, I see what you mean now, and yep, that kind of loop does 
indeed pass through doloop_valid_p


Interestingly, in the v8-M Arm ARM this is done with:

```

boolean IsLastLowOverheadLoop(INSTR_EXEC_STATE_Type state)
// This does not check whether a loop is currently active.
// If the PE were in a loop, would this be the last one?
return UInt(state.LoopCount) <= (1 << (4 - LTPSIZE));

```

So architecturally the asm we output would be ok (except maybe the 
"branch too far subs;bgt;lctp" fallback at 
`predicated_doloop_end_internal` (maybe that should be `bhi`))... But 
now GE: isn't looking like an accurate representation of this 
operation in the compiler.


I'm wondering if I should try to make 
`predicated_doloop_end_internal` contain a comparison along the lines 
of:

(gtu: (plus: (LR) (const_int -num_lanes)) (const_int num_lanes_minus_1))

I'll give that a try :)

The only reason I'd chosen to go with GE earlier, tbh, was because of 
the existing handling of GE in loop-doloop.cc


Let me know if any other ideas come to your mind!


Cheers,

Stam



It looks like I've had success with the below (diff to previous patch),
trimmed a bit to only the functionally interesting things:




diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 368d5138ca1..54dd4ee564b 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1649,16 +1649,28 @@
   && (decrement_num = arm_attempt_dlstp_transform (operands[1]))
   && (INTVAL (decrement_num) != 1))
     {
-      insn = emit_insn
-          (gen_thumb2_addsi3_compare0
-              (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num)))));
-      cmp = XVECEXP (PATTERN (insn), 0, 0);
-      cc_reg = SET_DEST (cmp);
-      bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);
   loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
-      emit_jump_insn (gen_rtx_SET (pc_rtx,
-                   gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
-                             loc_ref, pc_rtx)));
+      switch (INTVAL (decrement_num))
+        {
+          case 2:
+            insn = emit_jump_insn (gen_predicated_doloop_end_internal2
+                        (s0, loc_ref));
+            break;
+          case 4:
+            insn = emit_jump_insn (gen_predicated_doloop_end_internal4
+                        (s0, loc_ref));
+            break;
+          case 8:
+            insn = emit_jump_insn (gen_predicated_doloop_end_internal8
+                        (s0, loc_ref));
+            break;
+          case 16:
+            insn = emit_jump_insn (gen_predicated_doloop_end_internal16
+                        (s0, loc_ref));
+            break;
+          default:
+            gcc_unreachable ();
+        }
   DONE;
     }
 }

diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 93905583b18..c083f965fa9 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -6922,23 +6922,24 @@
 ;; 

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-11-06 Thread Stamatis Markianos-Wright



On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:

One of the main reasons for reading the arm bits was to try to answer
the question: if we switch to a downcounting loop with a GE condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in `doloop_valid_p`:

This kind of loops fail because of they are "desc->infinite", then no
loop-doloop conversion is attempted at all (even for standard dls/le loops)

Thanks to that check I haven't been able to trigger anything like the
behaviour you describe, do you think the doloop_valid_p checks are
robust enough?

The loops I was thinking of are provably not infinite though.  E.g.:

   for (unsigned int i = 0; i < UINT_MAX - 100; ++i)
 ...

is known to terminate.  And doloop conversion is safe with the normal
count-down-by-1 approach, so I don't think current code would need
to reject it.  I.e. a conversion to:

   unsigned int i = UINT_MAX - 101;
   do
 ...
   while (--i != ~0U);

would be safe, but a conversion to:

   int i = UINT_MAX - 101;
   do
 ...
   while ((i -= step, i > 0));

wouldn't, because the loop body would only be executed once.

I'm only going off the name "infinite" though :)  It's possible that
it has more connotations than that.

Thanks,
Richard


Ack, yep, I see what you mean now, and yep, that kind of loop does 
indeed pass through doloop_valid_p


Interestingly, in the v8-M Arm ARM this is done with:

```

boolean IsLastLowOverheadLoop(INSTR_EXEC_STATE_Type state)
// This does not check whether a loop is currently active.
// If the PE were in a loop, would this be the last one?
return UInt(state.LoopCount) <= (1 << (4 - LTPSIZE));

```

So architecturally the asm we output would be ok (except maybe the 
"branch too far subs;bgt;lctp" fallback at 
`predicated_doloop_end_internal` (maybe that should be `bhi`))... But 
now GE: isn't looking like an accurate representation of this operation 
in the compiler.


I'm wondering if I should try to make `predicated_doloop_end_internal` 
contain a comparison along the lines of:

(gtu: (plus: (LR) (const_int -num_lanes)) (const_int num_lanes_minus_1))

I'll give that a try :)

The only reason I'd chosen to go with GE earlier, tbh, was because of 
the existing handling of GE in loop-doloop.cc


Let me know if any other ideas come to your mind!


Cheers,

Stam




Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-11-06 Thread Richard Sandiford
Stamatis Markianos-Wright  writes:
>> One of the main reasons for reading the arm bits was to try to answer
>> the question: if we switch to a downcounting loop with a GE condition,
>> how do we make sure that the start value is not a large unsigned
>> number that is interpreted as negative by GE?  E.g. if the loop
>> originally counted up in steps of N and used an LTU condition,
>> it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
>> But the loop might never iterate if we start counting down from
>> most values in that range.
>>
>> Does the patch handle that?
>
> So AFAICT this is actually handled in the generic code in `doloop_valid_p`:
>
> This kind of loops fail because of they are "desc->infinite", then no 
> loop-doloop conversion is attempted at all (even for standard dls/le loops)
>
> Thanks to that check I haven't been able to trigger anything like the 
> behaviour you describe, do you think the doloop_valid_p checks are 
> robust enough?

The loops I was thinking of are provably not infinite though.  E.g.:

  for (unsigned int i = 0; i < UINT_MAX - 100; ++i)
...

is known to terminate.  And doloop conversion is safe with the normal
count-down-by-1 approach, so I don't think current code would need
to reject it.  I.e. a conversion to:

  unsigned int i = UINT_MAX - 101;
  do
...
  while (--i != ~0U);

would be safe, but a conversion to:

  int i = UINT_MAX - 101;
  do
...
  while ((i -= step, i > 0));

wouldn't, because the loop body would only be executed once.

I'm only going off the name "infinite" though :)  It's possible that
it has more connotations than that.

Thanks,
Richard


Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-10-24 Thread Richard Sandiford
Sorry for the slow review.  I had a look at the arm bits too, to get
some context for the target-independent bits.

Stamatis Markianos-Wright via Gcc-patches  writes:
> [...]
> diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> index 77e76336e94..74186930f0b 100644
> --- a/gcc/config/arm/arm-protos.h
> +++ b/gcc/config/arm/arm-protos.h
> @@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
>  extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
>  extern bool arm_q_bit_access (void);
>  extern bool arm_ge_bits_access (void);
> -extern bool arm_target_insn_ok_for_lob (rtx);
> -
> +extern bool arm_target_bb_ok_for_lob (basic_block);
> +extern rtx arm_attempt_dlstp_transform (rtx);
>  #ifdef RTX_CODE
>  enum reg_class
>  arm_mode_base_reg_class (machine_mode);
> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
> index 6e933c80183..39d97ba5e4d 100644
> --- a/gcc/config/arm/arm.cc
> +++ b/gcc/config/arm/arm.cc
> @@ -659,6 +659,12 @@ static const struct attribute_spec arm_attribute_table[]
> [...]
> +/* Wrapper function of arm_get_required_vpr_reg with TYPE == 1, so return
> +   something only if the VPR reg is an input operand to the insn.  */
> +
> +static rtx
> +ALWAYS_INLINE

Probably best to leave out the ALWAYS_INLINE.  That's generally only
appropriate for things that need to be inlined for correctness.

> +arm_get_required_vpr_reg_param (rtx_insn *insn)
> +{
> +  return arm_get_required_vpr_reg (insn, 1);
> +}
> [...]
> +/* Recursively scan through the DF chain backwards within the basic block and
> +   determine if any of the USEs of the original insn (or the USEs of the 
> insns
> +   where thy were DEF-ed, etc., recursively) were affected by implicit VPT
> +   predication of an MVE_VPT_UNPREDICATED_INSN_P in a dlstp/letp loop.
> +   This function returns true if the insn is affected implicit predication
> +   and false otherwise.
> +   Having such implicit predication on an unpredicated insn wouldn't in 
> itself
> +   block tail predication, because the output of that insn might then be used
> +   in a correctly predicated store insn, where the disabled lanes will be
> +   ignored.  To verify this we later call:
> +   `arm_mve_check_df_chain_fwd_for_implic_predic_impact`, which will check 
> the
> +   DF chains forward to see if any implicitly-predicated operand gets used in
> +   an improper way.  */
> +
> +static bool
> +arm_mve_check_df_chain_back_for_implic_predic
> +  (hash_map, bool>* safe_insn_map, rtx_insn *insn,
> +   rtx vctp_vpr_generated)
> +{
> +  bool* temp = NULL;
> +  if ((temp = safe_insn_map->get (INSN_UID (insn))))
> +    return *temp;
> +
> +  basic_block body = BLOCK_FOR_INSN (insn);
> +  /* The circumstances under which an instruction is affected by "implicit
> + predication" are as follows:
> +  * It is an UNPREDICATED_INSN_P:
> + * That loads/stores from/to memory.
> + * Where any one of its operands is an MVE vector from outside the
> +   loop body bb.
> + Or:
> +  * Any of it's operands, recursively backwards, are affected.  */
> +  if (MVE_VPT_UNPREDICATED_INSN_P (insn)
> +  && (arm_is_mve_load_store_insn (insn)
> +   || (arm_is_mve_across_vector_insn (insn)
> +   && !arm_mve_is_allowed_unpredic_across_vector_insn (insn
> +{
> +  safe_insn_map->put (INSN_UID (insn), true);
> +  return true;
> +}
> +
> +  df_ref insn_uses = NULL;
> +  FOR_EACH_INSN_USE (insn_uses, insn)
> +  {
> +/* If the operand is in the input reg set to the the basic block,
> +   (i.e. it has come from outside the loop!), consider it unsafe if:
> +  * It's being used in an unpredicated insn.
> +  * It is a predicable MVE vector.  */
> +if (MVE_VPT_UNPREDICATED_INSN_P (insn)
> + && VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_uses)))
> + && REGNO_REG_SET_P (DF_LR_IN (body), DF_REF_REGNO (insn_uses)))
> +  {
> + safe_insn_map->put (INSN_UID (insn), true);
> + return true;
> +  }
> +/* Scan backwards from the current INSN through the instruction chain
> +   until the start of the basic block.  */
> +for (rtx_insn *prev_insn = PREV_INSN (insn);
> +  prev_insn && prev_insn != PREV_INSN (BB_HEAD (body));
> +  prev_insn = PREV_INSN (prev_insn))
> +  {
> + /* If a previous insn defines a register that INSN uses, then recurse
> +in order to check that insn's USEs.
> +If any of these insns return true as MVE_VPT_UNPREDICATED_INSN_Ps,
> +then the whole chain is affected by the change in behaviour from
> +being placed in dlstp/letp loop.  */
> + df_ref prev_insn_defs = NULL;
> + FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
> + {
> +   if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
> +   && !arm_mve_vec_insn_is_predicated_with_this_predicate
> +(insn, vctp_vpr_generated)
> +   && 

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-10-23 Thread Andre Vieira (lists)
Ping for Jeff or another global maintainer to review the target agnostic 
bits of this, that's:

loop-doloop.cc
df-core.{c,h}

I do have a nitpick myself that I missed last time around:
  /* We expect the condition to be of the form (reg != 0)  */
  cond = XEXP (SET_SRC (cmp), 0);
- if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+ if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE)
+ || XEXP (cond, 1) != const0_rtx)
return 0;
}
Could do with updating the comment to reflect allowing >= now. But happy 
for you to change this once approved by a maintainer.


Kind regards,
Andre

On 11/10/2023 12:34, Stamatis Markianos-Wright wrote:

Hi all,

On 28/09/2023 13:51, Andre Vieira (lists) wrote:

Hi,

On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote:

Hi Stam,





The arm parts look sensible but we'd need review for the df-core.h 
and df-core.cc changes.

Maybe Jeff can help or can recommend someone to take a look?


Just thought I'd do a follow-up "ping" on this :)



Thanks,
Kyrill



FWIW the changes LGTM; if we don't want these in df-core we can always 
implement the extra utility locally. It's really just a helper 
function to check if df_bb_regno_first_def_find and 
df_bb_regno_last_def_find yield the same result, meaning we only have 
a single definition.


Kind regards,
Andre


Thanks,

Stam



Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-10-11 Thread Stamatis Markianos-Wright

Hi all,

On 28/09/2023 13:51, Andre Vieira (lists) wrote:

Hi,

On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote:

Hi Stam,





The arm parts look sensible but we'd need review for the df-core.h 
and df-core.cc changes.

Maybe Jeff can help or can recommend someone to take a look?


Just thought I'd do a follow-up "ping" on this :)



Thanks,
Kyrill



FWIW the changes LGTM; if we don't want these in df-core we can always 
implement the extra utility locally. It's really just a helper 
function to check if df_bb_regno_first_def_find and 
df_bb_regno_last_def_find yield the same result, meaning we only have 
a single definition.


Kind regards,
Andre


Thanks,

Stam



Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-09-28 Thread Andre Vieira (lists)

Hi,

On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote:

Hi Stam,





The arm parts look sensible but we'd need review for the df-core.h and 
df-core.cc changes.
Maybe Jeff can help or can recommend someone to take a look?
Thanks,
Kyrill



FWIW the changes LGTM; if we don't want these in df-core we can always 
implement the extra utility locally. It's really just a helper function 
to check if df_bb_regno_first_def_find and df_bb_regno_last_def_find 
yield the same result, meaning we only have a single definition.
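[Editorial note: a hedged usage sketch of the helper from a caller's
perspective; `body` and `counter_reg` are illustrative names, not quotes
from the patch.]

```
  /* Bail out unless the loop counter has exactly one definition
     inside the loop body.  */
  df_ref def = df_bb_regno_only_def_find (body, REGNO (counter_reg));
  if (!def)
    return NULL;	/* zero defs, or more than one */
  rtx_insn *def_insn = DF_REF_INSN (def);
```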


Kind regards,
Andre


RE: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-09-14 Thread Kyrylo Tkachov via Gcc-patches
Hi Stam,

> -Original Message-
> From: Stam Markianos-Wright 
> Sent: Wednesday, September 6, 2023 6:19 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> 
> Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low
> Overhead Loops
> 
> Hi all,
> 
> This is the 2/2 patch that contains the functional changes needed
> for MVE Tail Predicated Low Overhead Loops.  See my previous email
> for a general introduction of MVE LOLs.
> 
> This support is added through the already existing loop-doloop
> mechanisms that are used for non-MVE dls/le looping.
> 
> Mid-end changes are:
> 
> 1) Relax the loop-doloop mechanism in the mid-end to allow for
>     decrement numbers other than -1 and for `count` to be an
>     rtx containing a simple REG (which in this case will contain
>     the number of elements to be processed), rather
>     than an expression for calculating the number of iterations.
> 2) Added a new df utility function: `df_bb_regno_only_def_find` that
>     will return the DEF of a REG if it is DEF-ed only once within the
>     basic block.
> 
> And many things in the backend to implement the above optimisation:
> 
> 3)  Implement the `arm_predict_doloop_p` target hook to instruct the
>      mid-end about Low Overhead Loops (MVE or not), as well as
>      `arm_loop_unroll_adjust` which will prevent unrolling of any loops
>      that are valid for becoming MVE Tail_Predicated Low Overhead Loops
>      (unrolling can transform a loop in ways that invalidate the dlstp/
>      letp transformation logic and the benefit of the dlstp/letp loop
>      would be considerably higher than that of unrolling)
> 4)  Appropriate changes to the define_expand of doloop_end, new
>      patterns for dlstp and letp, new iterators,  unspecs, etc.
> 5) `arm_mve_loop_valid_for_dlstp` and a number of checking functions:
>     * `arm_mve_dlstp_check_dec_counter`
>     * `arm_mve_dlstp_check_inc_counter`
>     * `arm_mve_check_reg_origin_is_num_elems`
>     * `arm_mve_check_df_chain_back_for_implic_predic`
>     * `arm_mve_check_df_chain_fwd_for_implic_predic_impact`
>     All of these, in some way or another, run checks on the loop
>     structure in order to determine if the loop is valid for dlstp/letp
>     transformation.
> 6) `arm_attempt_dlstp_transform`: (called from the define_expand of
>      doloop_end) this function re-checks for the loop's suitability for
>      dlstp/letp transformation and then implements it, if possible.
> 7) Various utility functions:
>     *`arm_mve_get_vctp_lanes` to map
>     from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
>     to check an insn to see if it requires the VPR or not.
>     * `arm_mve_get_loop_vctp`
>     * `arm_mve_get_vctp_lanes`
>     * `arm_emit_mve_unpredicated_insn_to_seq`
>     * `arm_get_required_vpr_reg`
>     * `arm_get_required_vpr_reg_param`
>     * `arm_get_required_vpr_reg_ret_val`
>     * `arm_mve_is_across_vector_insn`
>     * `arm_is_mve_load_store_insn`
>     * `arm_mve_vec_insn_is_predicated_with_this_predicate`
>     * `arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate`
> 
> No regressions on arm-none-eabi with various targets and on
> aarch64-none-elf. Thoughts on getting this into trunk?
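[Editorial note: item (3) of the description above can be illustrated with a
minimal sketch of the unroll-adjust hook's shape; `loop_is_dlstp_candidate_p`
is a stand-in name for the patch's actual validity machinery
(arm_mve_loop_valid_for_dlstp and friends), so treat this as illustrative
rather than the committed implementation.]

```
static unsigned
arm_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
{
  /* Keep dlstp/letp candidates intact: unrolling would break the shape
     the transform looks for, and the tail-predicated loop is expected
     to win anyway.  */
  if (TARGET_HAVE_MVE && loop_is_dlstp_candidate_p (loop))
    return 1;
  return nunroll;
}
```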

The arm parts look sensible but we'd need review for the df-core.h and 
df-core.cc changes.
Maybe Jeff can help or can recommend someone to take a look?
Thanks,
Kyrill

> 
> Thank you,
> Stam Markianos-Wright
> 
> gcc/ChangeLog:
> 
>      * config/arm/arm-protos.h (arm_target_insn_ok_for_lob): Rename to...
>      (arm_target_bb_ok_for_lob): ...this
>      (arm_attempt_dlstp_transform): New.
>      * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): New.
>      (TARGET_PREDICT_DOLOOP_P): New.
>      (arm_block_set_vect):
>      (arm_target_insn_ok_for_lob): Rename from arm_target_insn_ok_for_lob.
>      (arm_target_bb_ok_for_lob): New.
>      (arm_mve_get_vctp_lanes): New.
>      (arm_get_required_vpr_reg): New.
>      (arm_get_required_vpr_reg_param): New.
>      (arm_get_required_vpr_reg_ret_val): New.
>      (arm_mve_get_loop_vctp): New.
>      (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate): New.
>      (arm_mve_vec_insn_is_predicated_with_this_predicate): New.
>      (arm_mve_check_df_chain_back_for_implic_predic): New.
>      (arm_mve_check_df_chain_fwd_for_implic_predic_impact): New.
>      (arm_mve_check_reg_origin_is_num_elems): New.
>      (arm_mve_dlstp_check_inc_counter): New.
>      (arm_mve_dlstp_check_dec_counter): New.
>      (arm_mve_loop_valid_for_dlstp): New.
>      (arm_mve_is_across_vector_insn): New.
>      (arm_is_mve_load_store_insn): New.
>      (arm_predict_doloop_p): Ne

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-07-05 Thread Stamatis Markianos-Wright via Gcc-patches



On 23/06/2023 11:23, Andre Vieira (lists) wrote:

+  if (insn != arm_mve_get_loop_vctp (body))
+    {

probably a good idea to invert the condition here and return false; it
helps reduce the indenting in this function.


Done, thanks





+    /* Starting from the current insn, scan backwards through the insn
+       chain until BB_HEAD: "for each insn in the BB prior to the current".
+    */

There's a trailing whitespace after insn, but also I'd rewrite this bit.
The "for each insn in the BB prior to the current" is superfluous and
even confusing to me. How about:
"Scan backwards from the current INSN through the instruction chain
until the start of the basic block.  "

Yes, agreed, it wasn't very clear. Done.



 I find 'that previous insn' to be confusing as you don't mention any 
previous insn before. So how about something along the lines of:
'If a previous insn defines a register that INSN uses then return true 
if...'

Done



Do we need to check: 'insn != prev_insn' ? Any reason why you can't 
start the loop with:

'for (rtx_insn *prev_insn = PREV_INSN (insn);'


True! Done.



Now I also found a case where things might go wrong in:

+	/* Look at all the DEFs of that previous insn: if one of them is on
+	   the same REG as our current insn, then recurse in order to check
+	   that insn's USEs.  If any of these insns return true as
+	   MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is affected
+	   by the change in behaviour from being placed in dlstp/letp loop.
+	*/
+	df_ref prev_insn_defs = NULL;
+	FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
+	  {
+	    if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
+		&& insn != prev_insn
+		&& body == BLOCK_FOR_INSN (prev_insn)
+		&& !arm_mve_vec_insn_is_predicated_with_this_predicate
+		     (insn, vctp_vpr_generated)
+		&& arm_mve_check_df_chain_back_for_implic_predic
+		     (prev_insn, vctp_vpr_generated))
+	      return true;
+	  }

The body == BLOCK_FOR_INSN (prev_insn) hinted at it: if a def comes
from outside of the BB (so outside of the loop's body) then it is by
definition unpredicated by vctp.  I think you want to check that if
prev_insn defines a register used by insn, then you return true if
prev_insn isn't in the same BB or has a chain that is not predicated,
i.e. alongside '!arm_mve_vec_insn_is_predicated_with_this_predicate
(insn, vctp_vpr_generated) && arm_mve_check_df_chain_back_for_implic_predic
(prev_insn, vctp_vpr_generated)' you check 'body != BLOCK_FOR_INSN
(prev_insn)'.


Yes, you're right, this is vulnerable here. A neater fix to this (I
think?) is to make the above REGNO_REG_SET_P check more generic, so that
it covers all scalar values and scalar ops as well.
Then it's an "if this insn in the loop has any input that originates
outside the bb, then it's unsafe" check, and the recursive loop backwards
is only for the recursive "are any previous insns unsafe" check.






I also found some other issues, this currently loloops:

uint16_t  test (uint16_t *a, int n)
{
  uint16_t res =0;
  while (n > 0)
    {
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vaddvaq_u16 (res, va);
  res = vaddvaq_p_u16 (res, va, p);
  a += 8;
  n -= 8;
    }
  return res;
}

But it shouldn't; this is because there's a lack of handling of
across-vector instructions. Luckily in MVE all across-vector instructions
have the side-effect that they write to a scalar register, even the
vshlcq instruction (it writes to a scalar carry output).


Added support for them (you were right, there was some special handling 
needed!)





This did lead me to find an ICE with:

uint16x8_t  test (uint16_t *a, int n)
{
  uint16x8_t res = vdupq_n_u16 (0);
  while (n > 0)
    {
  uint16_t carry = 0;
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vshlcq_u16 (va, &carry, 1);
  res = vshlcq_m_u16 (res, &carry, 1, p);
  a += 8;
  n -= 8;
    }
  return res;
}

This is because:
+	      /* If the USE is outside the loop body bb, or it is inside, but
+		 is an unpredicated store to memory.  */
+	      if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn)
+		  || (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate
+			(next_use_insn, vctp_vpr_generated)
+		      && mve_memory_operand
+			   (SET_DEST (single_set (next_use_insn)),
+			    GET_MODE (SET_DEST (single_set (next_use_insn))))))
+		return true;

Assumes single_set doesn't return 0.


Thanks! That is indeed correct.

Corrected this by having a utility function to scan insn operands and
check against mve_memory_operand that supports any number of
operands/SETs in the insn.
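[Editorial note: for illustration only, the simplest form of the missing
null guard; the actual fix described above scans all operands with a helper
instead.]

```
  rtx set = single_set (next_use_insn);
  if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn)
      || (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate
	    (next_use_insn, vctp_vpr_generated)
	  && set != NULL_RTX
	  && mve_memory_operand (SET_DEST (set),
				 GET_MODE (SET_DEST (set)))))
    return true;
```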




Let's deal with these issues and I'll continue to review.

On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:

 Hi all,

 This is 

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-07-05 Thread Stamatis Markianos-Wright via Gcc-patches
Thank you Andre for reviewing! I'll attach the updated version of the 
patch to the third review email (your final one thus far ;)


On 22/06/2023 16:54, Andre Vieira (lists) wrote:
Some comments below, all quite minor. I'll continue to review 
tomorrow, I need a fresher brain for 
arm_mve_check_df_chain_back_for_implic_predic  ;)


+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
+      && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == VCTP_M))
+    {
+      switch (GET_MODE (XEXP (x, 1)))
+	{
+	  case V16BImode:
+	    return 16;
+	  case V8BImode:
+	    return 8;
+	  case V4BImode:
+	    return 4;
+	  case V2QImode:
+	    return 2;
+	  default:
+	    break;
+	}
+    }
+  return 0;
+}

I think you can replace the switch with something along the lines of:
machine_mode mode = GET_MODE (XEXP (x, 1));
return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 0;


Ah true, especially now that there are no HImode predicates!

I added an additional check of `&& VALID_MVE_PRED_MODE (mode)` as well, 
just to make sure we could never pick up V4SImode, etc. (although I'd 
never expect that to happen if `rtx x` came from a valid instruction)
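
i.e. roughly this (sketch of the combined check):

  machine_mode mode = GET_MODE (XEXP (x, 1));
  if (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
    return GET_MODE_NUNITS (mode);
  return 0;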

+/* Check if an insn requires the use of the VPR_REG, if it does, return the
+   sub-rtx of the VPR_REG.  The `type` argument controls whether
+   this function should:
+   * For type == 0, check all operands, including the OUT operands,
+     and return the first occurance of the VPR_REG.

s/occurance/occurrence/

Done


+  bool requires_vpr;
+  extract_constrain_insn (insn);

indent of requires_vpr is off.

Done


+  if (type == 1 && (recog_data.operand_type[op] == OP_OUT
+    || recog_data.operand_type[op] == OP_INOUT))
+    continue;
+  else if (type == 2 && (recog_data.operand_type[op] == OP_IN
+ || recog_data.operand_type[op] == OP_INOUT))
+    continue;

Why skip INOUT? I guess this will become clear when I see the uses, 
but I'm wondering whether 'only check the input operands.' is clear 
enough. Maybe 'check operands that are input only.' would be more 
accurate?
Oh! Thanks for spotting this. It also doesn't work with my comment at 
the top:

`(INOUT operands are considered both as input and output operands)`

It's been a long time since I wrote this piece, but it might be that I 
added this after realising that there are no insns with an OP_INOUT VPR 
reg. Since I don't think it's functional, I changed the code to align 
with the comment, instead.
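
i.e. something like this (sketch), so that INOUT operands are checked 
both as inputs and as outputs:

  if (type == 1 && recog_data.operand_type[op] == OP_OUT)
    continue;
  else if (type == 2 && recog_data.operand_type[op] == OP_IN)
    continue;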

+  /* Fetch the reg_class for each entry and check it against the
+   * VPR_REG reg_class.  */

Remove leading * on the second line.

Damn auto-formatters ;)
Done


+
+/* Wrapper function of arm_get_required_vpr_reg with type == 1, so return
+   something only if the VPR reg is an input operand to the insn.  */

When talking about a function parameter in comments capitalize (INSN) 
the name. Same for:

Done


+/* Wrapper function of arm_get_required_vpr_reg with type == 2, so return
+   something only if the VPR reg is the retrurn value, an output of, or is
+   clobbered by the insn.  */

+/* Return true if an insn is an MVE instruction that VPT-predicable, but in
+   its unpredicated form, or if it is predicated, but on a predicate other
+   than vpr_reg.  */

In this one also 'is a MVE instruction that is VPT-predicable' would 
be better I think.

Oops, thanks for spotting. Done.



On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:
>  Hi all,
>
>  This is the 2/2 patch that contains the functional changes needed
>  for MVE Tail Predicated Low Overhead Loops.  See my previous email
>  for a general introduction of MVE LOLs.
>
>  This support is added through the already existing loop-doloop
>  mechanisms that are used for non-MVE dls/le looping.
>
>  Mid-end changes are:
>
>  1) Relax the loop-doloop mechanism in the mid-end to allow for
>     decrement numbers other than -1 and for `count` to be an
>     rtx containing a simple REG (which in this case will contain
>     the number of elements to be processed), rather
>     than an expression for calculating the number of iterations.
>  2) Added a new df utility function: `df_bb_regno_only_def_find` that
>     will return the DEF of a REG only if it is DEF-ed once within the
>     basic block.
>
>  And many things in the backend to implement the above optimisation:
>
>  3)  Implement the `arm_predict_doloop_p` target hook to instruct the
>      mid-end about Low Overhead Loops (MVE or not), as well as
>      `arm_loop_unroll_adjust` which will prevent unrolling of any loops
>      that are valid for becoming MVE Tail-Predicated Low Overhead Loops
>      (unrolling can transform a loop in ways that invalidate the dlstp/
>      letp transformation logic and the benefit of the dlstp/letp loop


Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-23 Thread Andre Vieira (lists) via Gcc-patches

+  /* In order to find out if the loop is of type A or B above look for the
+ loop counter: it will either be incrementing by one per iteration or
+ it will be decrementing by num_of_lanes.  We can find the loop counter
+ in the condition at the end of the loop.  */
+  rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (body));
+  gcc_assert (cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
+ && GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE);

Not sure this should be an assert. If we do encounter a differently 
formed loop, we should bail out of DLSTPing for now but we shouldn't ICE.
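
i.e. something along these lines (sketch):

  if (!loop_cond
      || !cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
      || GET_CODE (XEXP (PATTERN (loop_cond), 1)) != COMPARE)
    return false;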



+  /* The loop latch has to be empty.  When compiling all the known MVE LoLs
+     in user applications, none of those with incrementing counters had any
+     real insns in the loop latch.  As such, this function has only been
+     tested with an empty latch and may misbehave or ICE if we somehow get
+     here with an increment in the latch, so, for sanity, error out early.  */
+  rtx_insn *dec_insn = BB_END (body->loop_father->latch);
+  if (NONDEBUG_INSN_P (dec_insn))
+    gcc_unreachable ();

Similarly here I'd return false rather than gcc_unreachable ();
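
i.e. (sketch):

  rtx_insn *dec_insn = BB_END (body->loop_father->latch);
  if (NONDEBUG_INSN_P (dec_insn))
    return false;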


+  /* Find where both of those are modified in the loop body bb.  */
+  rtx condcount_reg_set = PATTERN (DF_REF_INSN (df_bb_regno_only_def_find
+                                                  (body, REGNO (condcount))));
Put the '=' on a newline; that breaks it down more nicely.

+  counter_orig_set = XEXP (PATTERN
+                            (DF_REF_INSN
+                              (DF_REF_NEXT_REG
+                                (DF_REG_DEF_CHAIN
+                                  (REGNO
+                                    (XEXP (condcount_reg_set, 0)))))), 1);

This makes me a bit nervous: can we be certain that the PATTERN of the 
next insn that sets it is indeed a set?  Heck, can we even be sure 
DF_REG_DEF_CHAIN returns a non-null?  I can't imagine why not, but maybe 
there are some constructs it can't follow up on.  It might just be worth 
checking these steps and bailing out.
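
Something like the following, perhaps (sketch only, checking each df step 
before dereferencing it):

  df_ref condcount_def = df_bb_regno_only_def_find (body, REGNO (condcount));
  if (!condcount_def)
    return false;
  rtx condcount_reg_set = single_set (DF_REF_INSN (condcount_def));
  if (!condcount_reg_set)
    return false;
  df_ref next_def = DF_REF_NEXT_REG (DF_REG_DEF_CHAIN
                                      (REGNO (SET_DEST (condcount_reg_set))));
  if (!next_def || !single_set (DF_REF_INSN (next_def)))
    return false;
  rtx counter_orig_set = SET_SRC (single_set (DF_REF_INSN (next_def)));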

+  /* When we find the vctp instruction: This may be followed by
+  a zero-extend insn to SImode.  If it is, then save the
+  zero-extended REG into vctp_vpr_generated.  If there is no
+  zero-extend, then store the raw output of the vctp.
+  For any VPT-predicated instructions we need to ensure that
+  the VPR they use is the same as the one given here and
+  they often consume the output of a subreg of the SImode
+  zero-extended VPR-reg.  As a result, comparing against the
+  output of the zero-extend is more likely to succeed.
+  This code also guarantees to us that the vctp comes before
+  any instructions that use the VPR within the loop, for the
+  dlstp/letp transform to succeed.  */

Wrong comment indent after first line.

+  rtx_insn *vctp_insn = arm_mve_get_loop_vctp (body);
+  if (!vctp_insn || !arm_mve_loop_valid_for_dlstp (body))
+return GEN_INT (1);

arm_mve_loop_valid_for_dlstp already calls arm_mve_get_loop_vctp, so maybe 
have 'arm_mve_loop_valid_for_dlstp' return vctp_insn or null to determine 
success or failure; that avoids looping through the BB again.


For the same reason I'd also pass vctp_insn down to 
'arm_mve_check_df_chain_back_for_implic_predic'.
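
i.e. the caller would then become something like (sketch):

  rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (body);
  if (!vctp_insn)
    return GEN_INT (1);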


+ if (GET_CODE (SET_SRC (single_set (next_use1))) == ZERO_EXTEND)
+   {
+ rtx_insn *next_use2 = NULL;

Are we sure single_set can never return 0 here? Maybe worth an extra 
check and bail out if it does?
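
e.g. (sketch):

  rtx next_use1_set = single_set (next_use1);
  if (next_use1_set
      && GET_CODE (SET_SRC (next_use1_set)) == ZERO_EXTEND)
    {
      /* ... as before ... */
    }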


+   /* If the insn pattern requires the use of the VPR value from the
+ vctp as an input parameter.  */
s/an an input parameter./as an input parameter for predication./

+ /* None of registers USE-d by the instruction need can be the VPR
+vctp_vpr_generated.  This blocks the optimisation if there any
+instructions that use the optimised-out VPR value in any way
+other than as a VPT block predicate.  */

Reword this slightly to be less complex:
"This instruction USE-s the vctp_vpr_generated other than for predication; 
this blocks the transformation, as we are not allowed to optimise the VPR 
value away."


Will continue reviewing next week :)

On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:

     Hi all,

     This is the 2/2 patch that contains the functional changes needed
     for MVE Tail Predicated Low Overhead Loops.  See my previous email
     for a general introduction of MVE LOLs.

     This support is added through the already existing loop-doloop
     mechanisms that are used for non-MVE dls/le looping.

     Mid-end changes are:

     1) Relax the loop-doloop mechanism in the mid-end to allow for
    decrement numbers other than -1 and for `count` to be an
    rtx containing a simple REG (which in this case will contain
    the number of elements to be processed), 

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-23 Thread Andre Vieira (lists) via Gcc-patches

+  if (insn != arm_mve_get_loop_vctp (body))
+{

probably a good idea to invert the condition here and return false; that 
helps reduce the indenting in this function.
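
i.e. (sketch):

  if (insn == arm_mve_get_loop_vctp (body))
    return false;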



+   /* Starting from the current insn, scan backwards through the insn
+  chain until BB_HEAD: "for each insn in the BB prior to the current".
+   */

There's a trailing whitespace after insn, but also I'd rewrite this bit. 
The "for each insn in the BB prior to the current" is superfluous and 
even confusing to me. How about:
"Scan backwards from the current INSN through the instruction chain 
until the start of the basic block.  "



 I find 'that previous insn' to be confusing as you don't mention any 
previous insn before. So how about something along the lines of:
'If a previous insn defines a register that INSN uses then return true 
if...'



Do we need to check: 'insn != prev_insn' ? Any reason why you can't 
start the loop with:

'for (rtx_insn *prev_insn = PREV_INSN (insn);'
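
i.e. (sketch):

  for (rtx_insn *prev_insn = PREV_INSN (insn);
       prev_insn && prev_insn != PREV_INSN (BB_HEAD (body));
       prev_insn = PREV_INSN (prev_insn))
    {
      /* ... existing FOR_EACH_INSN_DEF walk ... */
    }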

Now I also found a case where things might go wrong in:
+   /* Look at all the DEFs of that previous insn: if one of them is on
+  the same REG as our current insn, then recurse in order to check
+  that insn's USEs.  If any of these insns return true as
+  MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is affected
+  by the change in behaviour from being placed in dlstp/letp loop.
+   */
+   df_ref prev_insn_defs = NULL;
+   FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
+ {
+   if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
+   && insn != prev_insn
+   && body == BLOCK_FOR_INSN (prev_insn)
+   && !arm_mve_vec_insn_is_predicated_with_this_predicate
+(insn, vctp_vpr_generated)
+   && arm_mve_check_df_chain_back_for_implic_predic
+(prev_insn, vctp_vpr_generated))
+ return true;
+ }

The body == BLOCK_FOR_INSN (prev_insn) check hinted me at it: if a def 
comes from outside of the BB (so outside of the loop's body) then it's by 
definition unpredicated by vctp.  I think you want to check that if 
prev_insn defines a register used by insn, then return true if prev_insn 
isn't in the same BB or has a chain that is not predicated, i.e. alongside 
'!arm_mve_vec_insn_is_predicated_with_this_predicate (insn, 
vctp_vpr_generated) && arm_mve_check_df_chain_back_for_implic_predic 
(prev_insn, vctp_vpr_generated)' you also check 'body != BLOCK_FOR_INSN 
(prev_insn)'.
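
In other words, something like this (sketch):

  if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
      && insn != prev_insn
      && (body != BLOCK_FOR_INSN (prev_insn)
          || (!arm_mve_vec_insn_is_predicated_with_this_predicate
                (insn, vctp_vpr_generated)
              && arm_mve_check_df_chain_back_for_implic_predic
                (prev_insn, vctp_vpr_generated))))
    return true;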



I also found some other issues; this currently loloops (i.e. gets turned 
into a low-overhead loop):

uint16_t test (uint16_t *a, int n)
{
  uint16_t res = 0;
  while (n > 0)
    {
      mve_pred16_t p = vctp16q (n);
      uint16x8_t va = vldrhq_u16 (a);
      res = vaddvaq_u16 (res, va);
      res = vaddvaq_p_u16 (res, va, p);
      a += 8;
      n -= 8;
    }
  return res;
}

But it shouldn't, because there's a lack of handling of across-vector 
instructions.  Luckily, in MVE all across-vector instructions have the 
side-effect that they write to a scalar register, even the vshlcq 
instruction (it writes to a scalar carry output).


This did lead me to find an ICE with:

uint16x8_t test (uint16_t *a, int n)
{
  uint16x8_t res = vdupq_n_u16 (0);
  while (n > 0)
    {
      uint16_t carry = 0;
      mve_pred16_t p = vctp16q (n);
      uint16x8_t va = vldrhq_u16 (a);
      res = vshlcq_u16 (va, &carry, 1);
      res = vshlcq_m_u16 (res, &carry, 1, p);
      a += 8;
      n -= 8;
    }
  return res;
}

This is because:
+	  /* If the USE is outside the loop body bb, or it is inside, but
+	     is an unpredicated store to memory.  */
+	  if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn)
+	      || (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate
+		    (next_use_insn, vctp_vpr_generated)
+		  && mve_memory_operand
+		       (SET_DEST (single_set (next_use_insn)),
+			GET_MODE (SET_DEST (single_set (next_use_insn))))))
+	    return true;

Assumes single_set doesn't return 0.

Let's deal with these issues and I'll continue to review.

On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:

     Hi all,

     This is the 2/2 patch that contains the functional changes needed
     for MVE Tail Predicated Low Overhead Loops.  See my previous email
     for a general introduction of MVE LOLs.

     This support is added through the already existing loop-doloop
     mechanisms that are used for non-MVE dls/le looping.

     Mid-end changes are:

     1) Relax the loop-doloop mechanism in the mid-end to allow for
    decrement numbers other than -1 and for `count` to be an
    rtx containing a simple REG (which in this case will contain
    the number of elements to be processed), rather
    than an expression for calculating the number of iterations.
  

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-22 Thread Andre Vieira (lists) via Gcc-patches
Some comments below, all quite minor.  I'll continue to review tomorrow; 
I need a fresher brain for arm_mve_check_df_chain_back_for_implic_predic ;)


+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
+      && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == VCTP_M))
+    {
+      switch (GET_MODE (XEXP (x, 1)))
+	{
+	case V16BImode:
+	  return 16;
+	case V8BImode:
+	  return 8;
+	case V4BImode:
+	  return 4;
+	case V2QImode:
+	  return 2;
+	default:
+	  break;
+	}
+    }
+  return 0;
+}

I think you can replace the switch with something along the lines of:
machine_mode mode = GET_MODE (XEXP (x, 1));
return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 0;


+/* Check if an insn requires the use of the VPR_REG, if it does, return the
+   sub-rtx of the VPR_REG.  The `type` argument controls whether
+   this function should:
+   * For type == 0, check all operands, including the OUT operands,
+ and return the first occurance of the VPR_REG.

s/occurance/occurrence/

+ bool requires_vpr;
+  extract_constrain_insn (insn);

indent of requires_vpr is off.

+  if (type == 1 && (recog_data.operand_type[op] == OP_OUT
+   || recog_data.operand_type[op] == OP_INOUT))
+   continue;
+  else if (type == 2 && (recog_data.operand_type[op] == OP_IN
+|| recog_data.operand_type[op] == OP_INOUT))
+   continue;

Why skip INOUT? I guess this will become clear when I see the uses, but 
I'm wondering whether 'only check the input operands.' is clear enough. 
Maybe 'check operands that are input only.' would be more accurate?


+ /* Fetch the reg_class for each entry and check it against the
+  * VPR_REG reg_class.  */

Remove leading * on the second line.

+
+/* Wrapper function of arm_get_required_vpr_reg with type == 1, so return
+   something only if the VPR reg is an input operand to the insn.  */

When talking about a function parameter in comments capitalize (INSN) 
the name. Same for:


+/* Wrapper function of arm_get_required_vpr_reg with type == 2, so return
+   something only if the VPR reg is the retrurn value, an output of, or is
+   clobbered by the insn.  */

+/* Return true if an insn is an MVE instruction that VPT-predicable, but in
+   its unpredicated form, or if it is predicated, but on a predicate other
+   than vpr_reg.  */

In this one also 'is a MVE instruction that is VPT-predicable' would be 
better I think.



On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:
>  Hi all,
>
>  This is the 2/2 patch that contains the functional changes needed
>  for MVE Tail Predicated Low Overhead Loops.  See my previous email
>  for a general introduction of MVE LOLs.
>
>  This support is added through the already existing loop-doloop
>  mechanisms that are used for non-MVE dls/le looping.
>
>  Mid-end changes are:
>
>  1) Relax the loop-doloop mechanism in the mid-end to allow for
>     decrement numbers other than -1 and for `count` to be an
>     rtx containing a simple REG (which in this case will contain
>     the number of elements to be processed), rather
>     than an expression for calculating the number of iterations.
>  2) Added a new df utility function: `df_bb_regno_only_def_find` that
>     will return the DEF of a REG only if it is DEF-ed once within the
>     basic block.
>
>  And many things in the backend to implement the above optimisation:
>
>  3)  Implement the `arm_predict_doloop_p` target hook to instruct the
>      mid-end about Low Overhead Loops (MVE or not), as well as
>      `arm_loop_unroll_adjust` which will prevent unrolling of any loops
>      that are valid for becoming MVE Tail-Predicated Low Overhead Loops
>      (unrolling can transform a loop in ways that invalidate the dlstp/
>      letp transformation logic and the benefit of the dlstp/letp loop
>      would be considerably higher than that of unrolling)
>  4)  Appropriate changes to the define_expand of doloop_end, new
>      patterns for dlstp and letp, new iterators, unspecs, etc.
>  5) `arm_mve_loop_valid_for_dlstp` and a number of checking functions:
>     * `arm_mve_dlstp_check_dec_counter`
>     * `arm_mve_dlstp_check_inc_counter`
>     * `arm_mve_check_reg_origin_is_num_elems`
>     * `arm_mve_check_df_chain_back_for_implic_predic`
>     * `arm_mve_check_df_chain_fwd_for_implic_predic_impact`
>     These all, in some way or another, run checks on the loop
>     structure in order to determine if the loop is valid for dlstp/letp
>     transformation.
>  6) `arm_attempt_dlstp_transform`: (called from the define_expand of
>     doloop_end) this function re-checks for the loop's suitability for


[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-15 Thread Stamatis Markianos-Wright via Gcc-patches

    Hi all,

    This is the 2/2 patch that contains the functional changes needed
    for MVE Tail Predicated Low Overhead Loops.  See my previous email
    for a general introduction of MVE LOLs.

    This support is added through the already existing loop-doloop
    mechanisms that are used for non-MVE dls/le looping.

    Mid-end changes are:

    1) Relax the loop-doloop mechanism in the mid-end to allow for
   decrement numbers other than -1 and for `count` to be an
   rtx containing a simple REG (which in this case will contain
   the number of elements to be processed), rather
   than an expression for calculating the number of iterations.
    2) Added a new df utility function: `df_bb_regno_only_def_find` that
   will return the DEF of a REG only if it is DEF-ed once within the
   basic block.

    And many things in the backend to implement the above optimisation:

    3)  Implement the `arm_predict_doloop_p` target hook to instruct the
    mid-end about Low Overhead Loops (MVE or not), as well as
    `arm_loop_unroll_adjust` which will prevent unrolling of any loops
    that are valid for becoming MVE Tail-Predicated Low Overhead Loops
    (unrolling can transform a loop in ways that invalidate the dlstp/
    letp transformation logic and the benefit of the dlstp/letp loop
    would be considerably higher than that of unrolling)
    4)  Appropriate changes to the define_expand of doloop_end, new
    patterns for dlstp and letp, new iterators,  unspecs, etc.
    5) `arm_mve_loop_valid_for_dlstp` and a number of checking functions:
   * `arm_mve_dlstp_check_dec_counter`
   * `arm_mve_dlstp_check_inc_counter`
   * `arm_mve_check_reg_origin_is_num_elems`
   * `arm_mve_check_df_chain_back_for_implic_predic`
   * `arm_mve_check_df_chain_fwd_for_implic_predic_impact`
   These all, in some way or another, run checks on the loop
   structure in order to determine if the loop is valid for dlstp/letp
   transformation.
    6) `arm_attempt_dlstp_transform`: (called from the define_expand of
    doloop_end) this function re-checks for the loop's suitability for
    dlstp/letp transformation and then implements it, if possible.
    7) Various utility functions:
   *`arm_mve_get_vctp_lanes` to map
   from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
   to check an insn to see if it requires the VPR or not.
   * `arm_mve_get_loop_vctp`
   * `arm_mve_get_vctp_lanes`
   * `arm_emit_mve_unpredicated_insn_to_seq`
   * `arm_get_required_vpr_reg`
   * `arm_get_required_vpr_reg_param`
   * `arm_get_required_vpr_reg_ret_val`
   * `arm_mve_vec_insn_is_predicated_with_this_predicate`
   * `arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate`

    No regressions on arm-none-eabi with various targets and on
    aarch64-none-elf. Thoughts on getting this into trunk?

    Thank you,
    Stam Markianos-Wright

    gcc/ChangeLog:

    * config/arm/arm-protos.h (arm_target_insn_ok_for_lob): 
Rename to...

    (arm_target_bb_ok_for_lob): ...this
    (arm_attempt_dlstp_transform): New.
    * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): New.
    (TARGET_PREDICT_DOLOOP_P): New.
    (arm_block_set_vect):
    (arm_target_insn_ok_for_lob): Rename to arm_target_bb_ok_for_lob.

    (arm_target_bb_ok_for_lob): New.
    (arm_mve_get_vctp_lanes): New.
    (arm_get_required_vpr_reg): New.
    (arm_get_required_vpr_reg_param): New.
    (arm_get_required_vpr_reg_ret_val): New.
    (arm_mve_get_loop_vctp): New.
    (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate): New.
    (arm_mve_vec_insn_is_predicated_with_this_predicate): New.
    (arm_mve_check_df_chain_back_for_implic_predic): New.
    (arm_mve_check_df_chain_fwd_for_implic_predic_impact): New.
    (arm_mve_check_reg_origin_is_num_elems): New.
    (arm_mve_dlstp_check_inc_counter): New.
    (arm_mve_dlstp_check_dec_counter): New.
    (arm_mve_loop_valid_for_dlstp): New.
    (arm_predict_doloop_p): New.
    (arm_loop_unroll_adjust): New.
    (arm_emit_mve_unpredicated_insn_to_seq): New.
    (arm_attempt_dlstp_transform): New.
    * config/arm/iterators.md (DLSTP): New.
    (mode1): Add DLSTP mappings.
    * config/arm/mve.md (*predicated_doloop_end_internal): New.
    (dlstp_insn): New.
    * config/arm/thumb2.md (doloop_end): Update for MVE LOLs.
    * config/arm/unspecs.md: New unspecs.
    * df-core.cc (df_bb_regno_only_def_find): New.
    * df.h (df_bb_regno_only_def_find): New.
    * loop-doloop.cc (doloop_condition_get): Relax conditions.
    (doloop_optimize): Add support for elementwise LoLs.

    gcc/testsuite/ChangeLog:

    * 

[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2022-11-28 Thread Stam Markianos-Wright via Gcc-patches


On 11/15/22 15:51, Andre Vieira (lists) wrote:


On 11/11/2022 17:40, Stam Markianos-Wright via Gcc-patches wrote:

Hi all,

This is the 2/2 patch that contains the functional changes needed
for MVE Tail Predicated Low Overhead Loops.  See my previous email
for a general introduction of MVE LOLs.

This support is added through the already existing loop-doloop
mechanisms that are used for non-MVE dls/le looping.

Changes are:

1) Relax the loop-doloop mechanism in the mid-end to allow for
   decrement numbers other than -1 and for `count` to be an
   rtx containing the number of elements to be processed, rather
   than an expression for calculating the number of iterations.
2) Add an `allow_elementwise_doloop` target hook. This allows the
   target backend to manipulate the iteration count as it needs:
   in our case to change it from a pre-calculation of the number
   of iterations to the number of elements to be processed.
3) The doloop_end target-insn now has an additional parameter:
   the `count` (note: this is before it gets modified to just be
   the number of elements), so that the decrement value is
   extracted from that parameter.

And many things in the backend to implement the above optimisation:

4)  Appropriate changes to the define_expand of doloop_end and new
    patterns for dlstp and letp.
5) `arm_attempt_dlstp_transform`: (called from the define_expand of
    doloop_end) this function checks for the loop's suitability for
    dlstp/letp transformation and then implements it, if possible.
6) `arm_mve_get_loop_unique_vctp`: A function that loops through
    the loop contents and returns the vctp VPR-generating operation
    within the loop, if it is unique and there is exclusively one
    vctp within the loop.
7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map
   from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
   to check an insn to see if it requires the VPR or not.

No regressions on arm-none-eabi with various targets and on
aarch64-none-elf. Thoughts on getting this into trunk?

Thank you,
Stam Markianos-Wright

gcc/ChangeLog:

    * config/aarch64/aarch64.md: Add extra doloop_end arg.
    * config/arm/arm-protos.h (arm_attempt_dlstp_transform): New.
    * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New.
    (arm_mve_get_vctp_lanes): New.
    (arm_get_required_vpr_reg): New.
    (arm_mve_get_loop_unique_vctp): New.
    (arm_attempt_dlstp_transform): New.
    (arm_allow_elementwise_doloop): New.
    * config/arm/iterators.md:
    * config/arm/mve.md (*predicated_doloop_end_internal): New.
    (dlstp_insn): New.
    * config/arm/thumb2.md (doloop_end): Update for MVE LOLs.
    * config/arm/unspecs.md: New unspecs.
    * config/ia64/ia64.md: Add extra doloop_end arg.
    * config/pru/pru.md: Add extra doloop_end arg.
    * config/rs6000/rs6000.md: Add extra doloop_end arg.
    * config/s390/s390.md: Add extra doloop_end arg.
    * config/v850/v850.md: Add extra doloop_end arg.
    * doc/tm.texi: Document new hook.
    * doc/tm.texi.in: Likewise.
    * loop-doloop.cc (doloop_condition_get): Relax conditions.
    (doloop_optimize): Add support for elementwise LoLs.
    * target-insns.def (doloop_end): Add extra arg.
    * target.def (allow_elementwise_doloop): New hook.
    * targhooks.cc (default_allow_elementwise_doloop): New.
    * targhooks.h (default_allow_elementwise_doloop): New.

gcc/testsuite/ChangeLog:

    * gcc.target/arm/lob.h: Update framework.
    * gcc.target/arm/lob1.c: Likewise.
    * gcc.target/arm/lob6.c: Likewise.
    * gcc.target/arm/dlstp-int16x8.c: New test.
    * gcc.target/arm/dlstp-int32x4.c: New test.
    * gcc.target/arm/dlstp-int64x2.c: New test.
    * gcc.target/arm/dlstp-int8x16.c: New test.


### Inline copy of patch ###

diff --git a/gcc/config/aarch64/aarch64.md 
b/gcc/config/aarch64/aarch64.md
index 
f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 
100644

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7366,7 +7366,8 @@
 ;; knows what to generate.
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))  ; loop pseudo
-   (use (match_operand 1 "" ""))] ; label
+   (use (match_operand 1 "" ""))  ; label
+   (use (match_operand 2 "" ""))] ; decrement constant
   "optimize > 0 && flag_modulo_sched"
 {
   rtx s0;
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 
550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 
100644

--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx 
*, rtx *, rtx *, rtx *);

 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
 extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx 

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2022-11-15 Thread Andre Vieira (lists) via Gcc-patches



On 11/11/2022 17:40, Stam Markianos-Wright via Gcc-patches wrote:

Hi all,

This is the 2/2 patch that contains the functional changes needed
for MVE Tail Predicated Low Overhead Loops.  See my previous email
for a general introduction of MVE LOLs.

This support is added through the already existing loop-doloop
mechanisms that are used for non-MVE dls/le looping.

Changes are:

1) Relax the loop-doloop mechanism in the mid-end to allow for
   decrement numbers other than -1 and for `count` to be an
   rtx containing the number of elements to be processed, rather
   than an expression for calculating the number of iterations.
2) Add an `allow_elementwise_doloop` target hook. This allows the
   target backend to manipulate the iteration count as it needs:
   in our case to change it from a pre-calculation of the number
   of iterations to the number of elements to be processed.
3) The doloop_end target-insn now has an additional parameter:
   the `count` (note: this is before it gets modified to just be
   the number of elements), so that the decrement value is
   extracted from that parameter.

And many things in the backend to implement the above optimisation:

4)  Appropriate changes to the define_expand of doloop_end and new
    patterns for dlstp and letp.
5) `arm_attempt_dlstp_transform`: (called from the define_expand of
    doloop_end) this function checks for the loop's suitability for
    dlstp/letp transformation and then implements it, if possible.
6) `arm_mve_get_loop_unique_vctp`: A function that loops through
    the loop contents and returns the vctp VPR-generating operation
    within the loop, if it is unique and there is exclusively one
    vctp within the loop.
7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map
   from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
   to check an insn to see if it requires the VPR or not.

No regressions on arm-none-eabi with various targets and on
aarch64-none-elf. Thoughts on getting this into trunk?

Thank you,
Stam Markianos-Wright

gcc/ChangeLog:

    * config/aarch64/aarch64.md: Add extra doloop_end arg.
    * config/arm/arm-protos.h (arm_attempt_dlstp_transform): New.
    * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New.
    (arm_mve_get_vctp_lanes): New.
    (arm_get_required_vpr_reg): New.
    (arm_mve_get_loop_unique_vctp): New.
    (arm_attempt_dlstp_transform): New.
    (arm_allow_elementwise_doloop): New.
    * config/arm/iterators.md:
    * config/arm/mve.md (*predicated_doloop_end_internal): New.
    (dlstp_insn): New.
    * config/arm/thumb2.md (doloop_end): Update for MVE LOLs.
    * config/arm/unspecs.md: New unspecs.
    * config/ia64/ia64.md: Add extra doloop_end arg.
    * config/pru/pru.md: Add extra doloop_end arg.
    * config/rs6000/rs6000.md: Add extra doloop_end arg.
    * config/s390/s390.md: Add extra doloop_end arg.
    * config/v850/v850.md: Add extra doloop_end arg.
    * doc/tm.texi: Document new hook.
    * doc/tm.texi.in: Likewise.
    * loop-doloop.cc (doloop_condition_get): Relax conditions.
    (doloop_optimize): Add support for elementwise LoLs.
    * target-insns.def (doloop_end): Add extra arg.
    * target.def (allow_elementwise_doloop): New hook.
    * targhooks.cc (default_allow_elementwise_doloop): New.
    * targhooks.h (default_allow_elementwise_doloop): New.

gcc/testsuite/ChangeLog:

    * gcc.target/arm/lob.h: Update framework.
    * gcc.target/arm/lob1.c: Likewise.
    * gcc.target/arm/lob6.c: Likewise.
    * gcc.target/arm/dlstp-int16x8.c: New test.
    * gcc.target/arm/dlstp-int32x4.c: New test.
    * gcc.target/arm/dlstp-int64x2.c: New test.
    * gcc.target/arm/dlstp-int8x16.c: New test.


### Inline copy of patch ###

diff --git a/gcc/config/aarch64/aarch64.md 
b/gcc/config/aarch64/aarch64.md
index 
f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 
100644

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7366,7 +7366,8 @@
 ;; knows what to generate.
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))  ; loop pseudo
-   (use (match_operand 1 "" ""))] ; label
+   (use (match_operand 1 "" ""))  ; label
+   (use (match_operand 2 "" ""))] ; decrement constant
   "optimize > 0 && flag_modulo_sched"
 {
   rtx s0;
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 
550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 
100644

--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, 
rtx *, rtx *, rtx *);

 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
 extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx arm_attempt_dlstp_transform (rtx, rtx);
 #ifdef RTX_CODE
 enum reg_class

[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2022-11-11 Thread Stam Markianos-Wright via Gcc-patches

Hi all,

This is the 2/2 patch that contains the functional changes needed
for MVE Tail Predicated Low Overhead Loops.  See my previous email
for a general introduction of MVE LOLs.

This support is added through the already existing loop-doloop
mechanisms that are used for non-MVE dls/le looping.

Changes are:

1) Relax the loop-doloop mechanism in the mid-end to allow for
   decrement numbers other than -1 and for `count` to be an
   rtx containing the number of elements to be processed, rather
   than an expression for calculating the number of iterations.
2) Add an `allow_elementwise_doloop` target hook. This allows the
   target backend to manipulate the iteration count as it needs:
   in our case to change it from a pre-calculation of the number
   of iterations to the number of elements to be processed.
3) The doloop_end target-insn now has an additional parameter:
   the `count` (note: this is before it gets modified to just be
   the number of elements), so that the decrement value is
   extracted from that parameter.

And many things in the backend to implement the above optimisation:

4)  Appropriate changes to the define_expand of doloop_end and new
    patterns for dlstp and letp.
5) `arm_attempt_dlstp_transform`: (called from the define_expand of
    doloop_end) this function checks for the loop's suitability for
    dlstp/letp transformation and then implements it, if possible.
6) `arm_mve_get_loop_unique_vctp`: A function that loops through
    the loop contents and returns the vctp VPR-generating operation
    within the loop, if it is unique and there is exclusively one
    vctp within the loop.
7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map
   from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
   to check an insn to see if it requires the VPR or not.

No regressions on arm-none-eabi with various targets and on
aarch64-none-elf. Thoughts on getting this into trunk?

Thank you,
Stam Markianos-Wright

gcc/ChangeLog:

    * config/aarch64/aarch64.md: Add extra doloop_end arg.
    * config/arm/arm-protos.h (arm_attempt_dlstp_transform): New.
    * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New.
    (arm_mve_get_vctp_lanes): New.
    (arm_get_required_vpr_reg): New.
    (arm_mve_get_loop_unique_vctp): New.
    (arm_attempt_dlstp_transform): New.
    (arm_allow_elementwise_doloop): New.
    * config/arm/iterators.md:
    * config/arm/mve.md (*predicated_doloop_end_internal): New.
    (dlstp_insn): New.
    * config/arm/thumb2.md (doloop_end): Update for MVE LOLs.
    * config/arm/unspecs.md: New unspecs.
    * config/ia64/ia64.md: Add extra doloop_end arg.
    * config/pru/pru.md: Add extra doloop_end arg.
    * config/rs6000/rs6000.md: Add extra doloop_end arg.
    * config/s390/s390.md: Add extra doloop_end arg.
    * config/v850/v850.md: Add extra doloop_end arg.
    * doc/tm.texi: Document new hook.
    * doc/tm.texi.in: Likewise.
    * loop-doloop.cc (doloop_condition_get): Relax conditions.
    (doloop_optimize): Add support for elementwise LoLs.
    * target-insns.def (doloop_end): Add extra arg.
    * target.def (allow_elementwise_doloop): New hook.
    * targhooks.cc (default_allow_elementwise_doloop): New.
    * targhooks.h (default_allow_elementwise_doloop): New.

gcc/testsuite/ChangeLog:

    * gcc.target/arm/lob.h: Update framework.
    * gcc.target/arm/lob1.c: Likewise.
    * gcc.target/arm/lob6.c: Likewise.
    * gcc.target/arm/dlstp-int16x8.c: New test.
    * gcc.target/arm/dlstp-int32x4.c: New test.
    * gcc.target/arm/dlstp-int64x2.c: New test.
    * gcc.target/arm/dlstp-int8x16.c: New test.


### Inline copy of patch ###

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 
100644

--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7366,7 +7366,8 @@
 ;; knows what to generate.
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))  ; loop pseudo
-   (use (match_operand 1 "" ""))] ; label
+   (use (match_operand 1 "" ""))  ; label
+   (use (match_operand 2 "" ""))] ; decrement constant
   "optimize > 0 && flag_modulo_sched"
 {
   rtx s0;
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 
550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 
100644

--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, 
rtx *, rtx *, rtx *);

 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
 extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx arm_attempt_dlstp_transform (rtx, rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git