Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Squashed the definition and changes to predicated_doloop_end_internal and dlstp*_insn into this patch to make sure the first patch builds independently On 18/12/2023 11:53, Andre Vieira wrote: Reworked Stam's patch after comments in: https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640362.html The original gcc ChangeLog remains unchanged, but I did split up some tests so here is the testsuite ChangeLog. gcc/testsuite/ChangeLog: * gcc.target/arm/lob.h: Update framework. * gcc.target/arm/lob1.c: Likewise. * gcc.target/arm/lob6.c: Likewise. * gcc.target/arm/mve/dlstp-compile-asm.c: New test. * gcc.target/arm/mve/dlstp-int16x8.c: New test. * gcc.target/arm/mve/dlstp-int16x8-run.c: New test. * gcc.target/arm/mve/dlstp-int32x4.c: New test. * gcc.target/arm/mve/dlstp-int32x4-run.c: New test. * gcc.target/arm/mve/dlstp-int64x2.c: New test. * gcc.target/arm/mve/dlstp-int64x2-run.c: New test. * gcc.target/arm/mve/dlstp-int8x16.c: New test. * gcc.target/arm/mve/dlstp-int8x16-run.c: New test. * gcc.target/arm/mve/dlstp-invalid-asm.c: New test. 
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 2f5ca79ed8ddd647b212782a0454ee4fefc07257..4f164c547406c43219900c111401540c7ef9d7d1 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void); extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); -extern bool arm_target_insn_ok_for_lob (rtx); - +extern bool arm_target_bb_ok_for_lob (basic_block); +extern rtx arm_attempt_dlstp_transform (rtx); #ifdef RTX_CODE enum reg_class arm_mode_base_reg_class (machine_mode); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 0c0cb14a8a4f043357b8acd7042a9f9386af1eb1..1ee72bcb7ec4bea5feea8453ceef7702b0088a73 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] = #undef TARGET_HAVE_CONDITIONAL_EXECUTION #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution +#undef TARGET_LOOP_UNROLL_ADJUST +#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust + +#undef TARGET_PREDICT_DOLOOP_P +#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p + #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p @@ -34483,19 +34489,1147 @@ arm_invalid_within_doloop (const rtx_insn *insn) } bool -arm_target_insn_ok_for_lob (rtx insn) +arm_target_bb_ok_for_lob (basic_block bb) { - basic_block bb = BLOCK_FOR_INSN (insn); /* Make sure the basic block of the target insn is a simple latch having as single predecessor and successor the body of the loop itself. Only simple loops with a single basic block as body are supported for 'low over head loop' making sure that LE target is above LE itself in the generated code. 
*/ - return single_succ_p (bb) -&& single_pred_p (bb) -&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src -&& contains_no_active_insn_p (bb); +&& single_pred_p (bb) +&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src; +} + +/* Utility fuction: Given a VCTP or a VCTP_M insn, return the number of MVE + lanes based on the machine mode being used. */ + +static int +arm_mve_get_vctp_lanes (rtx_insn *insn) +{ + rtx insn_set = single_set (insn); + if (insn_set + && GET_CODE (SET_SRC (insn_set)) == UNSPEC + && (XINT (SET_SRC (insn_set), 1) == VCTP + || XINT (SET_SRC (insn_set), 1) == VCTP_M)) +{ + machine_mode mode = GET_MODE (SET_SRC (insn_set)); + return (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode)) +? GET_MODE_NUNITS (mode) : 0; +} + return 0; +} + +/* Check if INSN requires the use of the VPR reg, if it does, return the + sub-rtx of the VPR reg. The TYPE argument controls whether + this function should: + * For TYPE == 0, check all operands, including the OUT operands, + and return the first occurrence of the VPR reg. + * For TYPE == 1, only check the input operands. + * For TYPE == 2, only check the output operands. + (INOUT operands are considered both as input and output operands) +*/ +static rtx +arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0) +{ + gcc_assert (type < 3); + if (!NONJUMP_INSN_P (insn)) +return NULL_RTX; + + bool requires_vpr; + extract_constrain_insn (insn); + int n_operands = recog_data.n_operands; + if (recog_data.n_alternatives == 0) +return NULL_RTX; + + /* Fill in recog_op_alt with information about the constraints of + this insn. */ + preprocess_constraints (insn); + + for (int op = 0; op < n_operands; op++) +{ + requires_vpr = true; + if (type == 1 &&
[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Reworked Stam's patch after comments in: https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640362.html The original gcc ChangeLog remains unchanged, but I did split up some tests so here is the testsuite ChangeLog. gcc/testsuite/ChangeLog: * gcc.target/arm/lob.h: Update framework. * gcc.target/arm/lob1.c: Likewise. * gcc.target/arm/lob6.c: Likewise. * gcc.target/arm/mve/dlstp-compile-asm.c: New test. * gcc.target/arm/mve/dlstp-int16x8.c: New test. * gcc.target/arm/mve/dlstp-int16x8-run.c: New test. * gcc.target/arm/mve/dlstp-int32x4.c: New test. * gcc.target/arm/mve/dlstp-int32x4-run.c: New test. * gcc.target/arm/mve/dlstp-int64x2.c: New test. * gcc.target/arm/mve/dlstp-int64x2-run.c: New test. * gcc.target/arm/mve/dlstp-int8x16.c: New test. * gcc.target/arm/mve/dlstp-int8x16-run.c: New test. * gcc.target/arm/mve/dlstp-invalid-asm.c: New test. diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 2f5ca79ed8d..4f164c54740 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void); extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); -extern bool arm_target_insn_ok_for_lob (rtx); - +extern bool arm_target_bb_ok_for_lob (basic_block); +extern rtx arm_attempt_dlstp_transform (rtx); #ifdef RTX_CODE enum reg_class arm_mode_base_reg_class (machine_mode); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 0c0cb14a8a4..1ee72bcb7ec 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] = #undef TARGET_HAVE_CONDITIONAL_EXECUTION #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution +#undef TARGET_LOOP_UNROLL_ADJUST +#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust + +#undef TARGET_PREDICT_DOLOOP_P +#define TARGET_PREDICT_DOLOOP_P 
arm_predict_doloop_p + #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p @@ -34483,19 +34489,1147 @@ arm_invalid_within_doloop (const rtx_insn *insn) } bool -arm_target_insn_ok_for_lob (rtx insn) +arm_target_bb_ok_for_lob (basic_block bb) { - basic_block bb = BLOCK_FOR_INSN (insn); /* Make sure the basic block of the target insn is a simple latch having as single predecessor and successor the body of the loop itself. Only simple loops with a single basic block as body are supported for 'low over head loop' making sure that LE target is above LE itself in the generated code. */ - return single_succ_p (bb) -&& single_pred_p (bb) -&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src -&& contains_no_active_insn_p (bb); + && single_pred_p (bb) + && single_succ_edge (bb)->dest == single_pred_edge (bb)->src; +} + +/* Utility fuction: Given a VCTP or a VCTP_M insn, return the number of MVE + lanes based on the machine mode being used. */ + +static int +arm_mve_get_vctp_lanes (rtx_insn *insn) +{ + rtx insn_set = single_set (insn); + if (insn_set + && GET_CODE (SET_SRC (insn_set)) == UNSPEC + && (XINT (SET_SRC (insn_set), 1) == VCTP + || XINT (SET_SRC (insn_set), 1) == VCTP_M)) +{ + machine_mode mode = GET_MODE (SET_SRC (insn_set)); + return (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode)) + ? GET_MODE_NUNITS (mode) : 0; +} + return 0; +} + +/* Check if INSN requires the use of the VPR reg, if it does, return the + sub-rtx of the VPR reg. The TYPE argument controls whether + this function should: + * For TYPE == 0, check all operands, including the OUT operands, + and return the first occurrence of the VPR reg. + * For TYPE == 1, only check the input operands. + * For TYPE == 2, only check the output operands. 
+ (INOUT operands are considered both as input and output operands) +*/ +static rtx +arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0) +{ + gcc_assert (type < 3); + if (!NONJUMP_INSN_P (insn)) +return NULL_RTX; + + bool requires_vpr; + extract_constrain_insn (insn); + int n_operands = recog_data.n_operands; + if (recog_data.n_alternatives == 0) +return NULL_RTX; + + /* Fill in recog_op_alt with information about the constraints of + this insn. */ + preprocess_constraints (insn); + + for (int op = 0; op < n_operands; op++) +{ + requires_vpr = true; + if (type == 1 && recog_data.operand_type[op] == OP_OUT) + continue; + else if (type == 2 && recog_data.operand_type[op] == OP_IN) + continue; + + /* Iterate through alternatives of operand "op" in recog_op_alt and + identify if the operand is required to be the VPR. */ + for (int alt = 0; alt < recog_data.n_alternatives; alt++) + { + const
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
On 30/11/2023 12:55, Stamatis Markianos-Wright wrote: Hi Andre, Thanks for the comments, see latest revision attached. On 27/11/2023 12:47, Andre Vieira (lists) wrote: Hi Stam, Just some comments. +/* Recursively scan through the DF chain backwards within the basic block and + determine if any of the USEs of the original insn (or the USEs of the insns s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :) + where thy were DEF-ed, etc., recursively) were affected by implicit VPT remove recursively for the same reasons. + if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step)) + return NULL; + /* Look at the steps and swap around the rtx's if needed. Error out if + one of them cannot be identified as constant. */ + if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0) + return NULL; Move the comment above the if before, as the erroring out it talks about is there. Done + emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body)); space after 'insn_note)' @@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !CONST_INT_P (XEXP (inc_src, 1))) Do we ever check that inc_src is negative? We used to check if it was -1, now we only check it's a constnat, but not a negative one, so I suspect this needs a: || INTVAL (XEXP (inc_src, 1)) >= 0 Good point. Done @@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc, case GE: /* Currently only GE tests against zero are supported. 
*/ gcc_assert (XEXP (condition, 1) == const0_rtx); - + /* FALLTHRU */ + case GTU: noloop = constm1_rtx; I spent a very long time staring at this trying to understand why noloop = constm1_rtx for GTU, where I thought it should've been (count & (n-1)). For the current use of doloop it doesn't matter because ARM is the only target using it and you set desc->noloop_assumptions to null_rtx in 'arm_attempt_dlstp_transform' so noloop is never used. However, if a different target accepts this GTU pattern then this target agnostic code will do the wrong thing. I suggest we either: - set noloop to what we think might be the correct value, which if you ask me should be 'count & (XEXP (condition, 1))', - or add a gcc_assert (GET_CODE (condition) != GTU); under the if (desc->noloop_assumption); part and document why. I have a slight preference for the assert given otherwise we are adding code that we can't test. Yea, that's true tbh. I've done the latter, but also separated out the "case GTU:" and added a comment, so that it's more clear that the noloop things aren't used in the only implemented GTU case (Arm) Thank you :) LGTM otherwise (but I don't have the power to approve this ;)). Kind regards, Andre From: Stamatis Markianos-Wright Sent: Thursday, November 16, 2023 11:36 AM To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; Richard Sandiford; Kyrylo Tkachov Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops Pinging back to the top of reviewers' inboxes due to worry about Stage 1 End in a few days :) See the last email for the latest version of the 2/2 patch. The 1/2 patch is A-Ok from Kyrill's earlier target-backend review. 
On 10/11/2023 12:41, Stamatis Markianos-Wright wrote: On 06/11/2023 17:29, Stamatis Markianos-Wright wrote: On 06/11/2023 11:24, Richard Sandiford wrote: Stamatis Markianos-Wright writes: One of the main reasons for reading the arm bits was to try to answer the question: if we switch to a downcounting loop with a GE condition, how do we make sure that the start value is not a large unsigned number that is interpreted as negative by GE? E.g. if the loop originally counted up in steps of N and used an LTU condition, it could stop at a value in the range [INT_MAX + 1, UINT_MAX]. But the loop might never iterate if we start counting down from most values in that range. Does the patch handle that? So AFAICT this is actually handled in the generic code in `doloop_valid_p`: This kind of loops fail because of they are "desc->infinite", then no loop-doloop conversion is attempted at all (even for standard dls/le loops) Thanks to that check I haven't been able to trigger anything like the behaviour you describe, do you think the doloop_valid_p checks are robust enough? The loops I was thinking of are provably no
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Sorry for the slow review. Stamatis Markianos-Wright writes: > [...] > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > index > 44a04b86cb5806fcf50917826512fd203d42106c..c083f965fa9a40781bc86beb6e63654afd14eac4 > 100644 > --- a/gcc/config/arm/mve.md > +++ b/gcc/config/arm/mve.md > @@ -6922,23 +6922,24 @@ > ;; Originally expanded by 'predicated_doloop_end'. > ;; In the rare situation where the branch is too far, we do also need to > ;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration. > -(define_insn "*predicated_doloop_end_internal" > +(define_insn "predicated_doloop_end_internal" >[(set (pc) > (if_then_else > -(ge (plus:SI (reg:SI LR_REGNUM) > - (match_operand:SI 0 "const_int_operand" "")) > - (const_int 0)) > - (label_ref (match_operand 1 "" "")) > +(gtu (unspec:SI [(plus:SI (match_operand:SI 0 "s_register_operand" > "=r") > + (const_int ))] > + LETP) > + (const_int )) Is there any need for the unspec? I couldn't see why this wasn't simply: (gtu (match_operand:SI 0 "s_register_operand" "=r") (const_int )) But I agree that using gtu rather than ge is nicer if it's what the instruction does. > diff --git a/gcc/df-core.cc b/gcc/df-core.cc > index > d4812b04a7cb97ea1606082e26e910472da5bcc1..4fcc14bf790d43e792b3c926fe1f80073d908c17 > 100644 > --- a/gcc/df-core.cc > +++ b/gcc/df-core.cc > @@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned > int regno) >return NULL; > } > > +/* Return the one and only def of REGNO within BB. If there is no def or > + there are multiple defs, return NULL. */ > + > +df_ref > +df_bb_regno_only_def_find (basic_block bb, unsigned int regno) > +{ > + df_ref temp = df_bb_regno_first_def_find (bb, regno); > + if (!temp) > +return NULL; > + else if (temp == df_bb_regno_last_def_find (bb, regno)) > +return temp; > + else > +return NULL; > +} > + > /* Finds the reference corresponding to the definition of REG in INSN. > DF is the dataflow object. 
*/ > > diff --git a/gcc/df.h b/gcc/df.h > index > 402657a7076f1bcad24e9c50682e033e57f432f9..98623637f9c839c799222e99df2a7173a770b2ac > 100644 > --- a/gcc/df.h > +++ b/gcc/df.h > @@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void); > #endif > extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int); > extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int); > +extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int); > extern df_ref df_find_def (rtx_insn *, rtx); > extern bool df_reg_defined (rtx_insn *, rtx); > extern df_ref df_find_use (rtx_insn *, rtx); > diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc > index > 4feb0a25ab9331b7124df900f73c9fc6fb3eb10b..d919207505c472c8a54a2c9c982a09061584177b > 100644 > --- a/gcc/loop-doloop.cc > +++ b/gcc/loop-doloop.cc > @@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat) > forms: > > 1) (parallel [(set (pc) (if_then_else (condition) > - (label_ref (label)) > - (pc))) > - (set (reg) (plus (reg) (const_int -1))) > - (additional clobbers and uses)]) > + (label_ref (label)) > + (pc))) > + (set (reg) (plus (reg) (const_int -1))) > + (additional clobbers and uses)]) > > The branch must be the first entry of the parallel (also required > by jump.cc), and the second entry of the parallel must be a set of > @@ -96,19 +96,34 @@ doloop_condition_get (rtx_insn *doloop_pat) > the loop counter in an if_then_else too. > > 2) (set (reg) (plus (reg) (const_int -1)) > - (set (pc) (if_then_else (reg != 0) > - (label_ref (label)) > - (pc))). > + (set (pc) (if_then_else (reg != 0) > + (label_ref (label)) > + (pc))). 
> > Some targets (ARM) do the comparison before the branch, as in the > following form: > > - 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0))) > - (set (reg) (plus (reg) (const_int -1)))]) > -(set (pc) (if_then_else (cc == NE) > -(label_ref (label)) > -(pc))) */ > - > + 3) (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0)) > +(set (reg) (plus (reg) (const_int -1)))]) > + (set (pc) (if_then_else (cc == NE) > + (label_ref (label)) > + (pc))) > + > + The ARM target also supports a special case of a counter that > decrements > + by `n` and terminating in a GTU condition. In that case, the compare > and > +
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Thanks for addressing my comments. I have reviewed this and the other patch before and they LGTM. I however do not have approval rights so you will need the OK from a maintainer. Thanks for doing this :) Andre On 30/11/2023 12:55, Stamatis Markianos-Wright wrote: Hi Andre, Thanks for the comments, see latest revision attached. On 27/11/2023 12:47, Andre Vieira (lists) wrote: Hi Stam, Just some comments. +/* Recursively scan through the DF chain backwards within the basic block and + determine if any of the USEs of the original insn (or the USEs of the insns s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :) + where thy were DEF-ed, etc., recursively) were affected by implicit VPT remove recursively for the same reasons. + if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step)) + return NULL; + /* Look at the steps and swap around the rtx's if needed. Error out if + one of them cannot be identified as constant. */ + if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0) + return NULL; Move the comment above the if before, as the erroring out it talks about is there. Done + emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body)); space after 'insn_note)' @@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !CONST_INT_P (XEXP (inc_src, 1))) Do we ever check that inc_src is negative? We used to check if it was -1, now we only check it's a constnat, but not a negative one, so I suspect this needs a: || INTVAL (XEXP (inc_src, 1)) >= 0 Good point. 
Done @@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc, case GE: /* Currently only GE tests against zero are supported. */ gcc_assert (XEXP (condition, 1) == const0_rtx); - + /* FALLTHRU */ + case GTU: noloop = constm1_rtx; I spent a very long time staring at this trying to understand why noloop = constm1_rtx for GTU, where I thought it should've been (count & (n-1)). For the current use of doloop it doesn't matter because ARM is the only target using it and you set desc->noloop_assumptions to null_rtx in 'arm_attempt_dlstp_transform' so noloop is never used. However, if a different target accepts this GTU pattern then this target agnostic code will do the wrong thing. I suggest we either: - set noloop to what we think might be the correct value, which if you ask me should be 'count & (XEXP (condition, 1))', - or add a gcc_assert (GET_CODE (condition) != GTU); under the if (desc->noloop_assumption); part and document why. I have a slight preference for the assert given otherwise we are adding code that we can't test. Yea, that's true tbh. I've done the latter, but also separated out the "case GTU:" and added a comment, so that it's more clear that the noloop things aren't used in the only implemented GTU case (Arm) Thank you :) LGTM otherwise (but I don't have the power to approve this ;)). Kind regards, Andre From: Stamatis Markianos-Wright Sent: Thursday, November 16, 2023 11:36 AM To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; Richard Sandiford; Kyrylo Tkachov Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops Pinging back to the top of reviewers' inboxes due to worry about Stage 1 End in a few days :) See the last email for the latest version of the 2/2 patch. The 1/2 patch is A-Ok from Kyrill's earlier target-backend review. 
On 10/11/2023 12:41, Stamatis Markianos-Wright wrote: On 06/11/2023 17:29, Stamatis Markianos-Wright wrote: On 06/11/2023 11:24, Richard Sandiford wrote: Stamatis Markianos-Wright writes: One of the main reasons for reading the arm bits was to try to answer the question: if we switch to a downcounting loop with a GE condition, how do we make sure that the start value is not a large unsigned number that is interpreted as negative by GE? E.g. if the loop originally counted up in steps of N and used an LTU condition, it could stop at a value in the range [INT_MAX + 1, UINT_MAX]. But the loop might never iterate if we start counting down from most values in that range. Does the patch handle that? So AFAICT this is actually handled in the generic code in `doloop_valid_p`: This kind of loops fail because of they are "desc->infinite", then no loop-doloop conversion is attempted at all (even for standa
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Hi Stam, Just some comments. +/* Recursively scan through the DF chain backwards within the basic block and + determine if any of the USEs of the original insn (or the USEs of the insns s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :) + where thy were DEF-ed, etc., recursively) were affected by implicit VPT remove recursively for the same reasons. + if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step)) + return NULL; + /* Look at the steps and swap around the rtx's if needed. Error out if +one of them cannot be identified as constant. */ + if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0) + return NULL; Move the comment above the if before, as the erroring out it talks about is there. + emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body)); space after 'insn_note)' @@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat) if (! REG_P (reg)) return 0; - /* Check if something = (plus (reg) (const_int -1)). + /* Check if something = (plus (reg) (const_int -n)). On IA-64, this decrement is wrapped in an if_then_else. */ inc_src = SET_SRC (inc); if (GET_CODE (inc_src) == IF_THEN_ELSE) inc_src = XEXP (inc_src, 1); if (GET_CODE (inc_src) != PLUS || XEXP (inc_src, 0) != reg - || XEXP (inc_src, 1) != constm1_rtx) + || !CONST_INT_P (XEXP (inc_src, 1))) Do we ever check that inc_src is negative? We used to check if it was -1, now we only check it's a constant, but not a negative one, so I suspect this needs a: || INTVAL (XEXP (inc_src, 1)) >= 0 @@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc, case GE: /* Currently only GE tests against zero are supported. */ gcc_assert (XEXP (condition, 1) == const0_rtx); - + /* FALLTHRU */ +case GTU: noloop = constm1_rtx; I spent a very long time staring at this trying to understand why noloop = constm1_rtx for GTU, where I thought it should've been (count & (n-1)). 
For the current use of doloop it doesn't matter because ARM is the only target using it and you set desc->noloop_assumptions to null_rtx in 'arm_attempt_dlstp_transform' so noloop is never used. However, if a different target accepts this GTU pattern then this target agnostic code will do the wrong thing. I suggest we either: - set noloop to what we think might be the correct value, which if you ask me should be 'count & (XEXP (condition, 1))', - or add a gcc_assert (GET_CODE (condition) != GTU); under the if (desc->noloop_assumption); part and document why. I have a slight preference for the assert given otherwise we are adding code that we can't test. LGTM otherwise (but I don't have the power to approve this ;)). Kind regards, Andre From: Stamatis Markianos-Wright Sent: Thursday, November 16, 2023 11:36 AM To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; Richard Sandiford; Kyrylo Tkachov Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops Pinging back to the top of reviewers' inboxes due to worry about Stage 1 End in a few days :) See the last email for the latest version of the 2/2 patch. The 1/2 patch is A-Ok from Kyrill's earlier target-backend review. On 10/11/2023 12:41, Stamatis Markianos-Wright wrote: On 06/11/2023 17:29, Stamatis Markianos-Wright wrote: On 06/11/2023 11:24, Richard Sandiford wrote: Stamatis Markianos-Wright writes: One of the main reasons for reading the arm bits was to try to answer the question: if we switch to a downcounting loop with a GE condition, how do we make sure that the start value is not a large unsigned number that is interpreted as negative by GE? E.g. if the loop originally counted up in steps of N and used an LTU condition, it could stop at a value in the range [INT_MAX + 1, UINT_MAX]. But the loop might never iterate if we start counting down from most values in that range. Does the patch handle that? 
So AFAICT this is actually handled in the generic code in `doloop_valid_p`: This kind of loops fail because of they are "desc->infinite", then no loop-doloop conversion is attempted at all (even for standard dls/le loops) Thanks to that check I haven't been able to trigger anything like the behaviour you describe, do you think the doloop_valid_p checks are robust enough? The loops I was thinking of are provably not infinite though. E.g.: for (unsigned int i = 0; i < UINT_MAX - 100; ++i) ... is known to terminate. And doloop conversion is safe with the normal count-down-by-1 approach, so I don't think current code would need to reject it. I.e. a conversion to: unsigned int i = UINT_MAX - 101; do ... while (--i != ~0U); would be safe, but a conversion to: int i = UINT_MAX - 101; do ...
[PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Pinging back to the top of reviewers' inboxes due to worry about Stage 1 End in a few days :) See the last email for the latest version of the 2/2 patch. The 1/2 patch is A-Ok from Kyrill's earlier target-backend review. On 10/11/2023 12:41, Stamatis Markianos-Wright wrote: On 06/11/2023 17:29, Stamatis Markianos-Wright wrote: On 06/11/2023 11:24, Richard Sandiford wrote: Stamatis Markianos-Wright writes: One of the main reasons for reading the arm bits was to try to answer the question: if we switch to a downcounting loop with a GE condition, how do we make sure that the start value is not a large unsigned number that is interpreted as negative by GE? E.g. if the loop originally counted up in steps of N and used an LTU condition, it could stop at a value in the range [INT_MAX + 1, UINT_MAX]. But the loop might never iterate if we start counting down from most values in that range. Does the patch handle that? So AFAICT this is actually handled in the generic code in `doloop_valid_p`: This kind of loops fail because of they are "desc->infinite", then no loop-doloop conversion is attempted at all (even for standard dls/le loops) Thanks to that check I haven't been able to trigger anything like the behaviour you describe, do you think the doloop_valid_p checks are robust enough? The loops I was thinking of are provably not infinite though. E.g.: for (unsigned int i = 0; i < UINT_MAX - 100; ++i) ... is known to terminate. And doloop conversion is safe with the normal count-down-by-1 approach, so I don't think current code would need to reject it. I.e. a conversion to: unsigned int i = UINT_MAX - 101; do ... while (--i != ~0U); would be safe, but a conversion to: int i = UINT_MAX - 101; do ... while ((i -= step, i > 0)); wouldn't, because the loop body would only be executed once. I'm only going off the name "infinite" though :) It's possible that it has more connotations than that. 
Thanks, Richard Ack, yep, I see what you mean now, and yep, that kind of loop does indeed pass through doloop_valid_p Interestingly , in the v8-M Arm ARM this is done with: ``` boolean IsLastLowOverheadLoop(INSTR_EXEC_STATE_Type state) // This does not check whether a loop is currently active. // If the PE were in a loop, would this be the last one? return UInt(state.LoopCount) <= (1 << (4 - LTPSIZE)); ``` So architecturally the asm we output would be ok (except maybe the "branch too far subs;bgt;lctp" fallback at `predicated_doloop_end_internal` (maybe that should be `bhi`))... But now GE: isn't looking like an accurate representation of this operation in the compiler. I'm wondering if I should try to make `predicated_doloop_end_internal` contain a comparison along the lines of: (gtu: (plus: (LR) (const_int -num_lanes)) (const_int num_lanes_minus_1)) I'll give that a try :) The only reason I'd chosen to go with GE earlier, tbh, was because of the existing handling of GE in loop-doloop.cc Let me know if any other ideas come to your mind! 
Cheers, Stam It looks like I've had success with the below (diff to previous patch), trimmed a bit to only the functionally interesting things:: diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 368d5138ca1..54dd4ee564b 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1649,16 +1649,28 @@ && (decrement_num = arm_attempt_dlstp_transform (operands[1])) && (INTVAL (decrement_num) != 1)) { - insn = emit_insn - (gen_thumb2_addsi3_compare0 - (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num); - cmp = XVECEXP (PATTERN (insn), 0, 0); - cc_reg = SET_DEST (cmp); - bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx); loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]); - emit_jump_insn (gen_rtx_SET (pc_rtx, - gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp, - loc_ref, pc_rtx))); + switch (INTVAL (decrement_num)) + { + case 2: + insn = emit_jump_insn (gen_predicated_doloop_end_internal2 + (s0, loc_ref)); + break; + case 4: + insn = emit_jump_insn (gen_predicated_doloop_end_internal4 + (s0, loc_ref)); + break; + case 8: + insn = emit_jump_insn (gen_predicated_doloop_end_internal8 + (s0, loc_ref)); + break; + case 16: + insn = emit_jump_insn (gen_predicated_doloop_end_internal16 + (s0, loc_ref)); + break; + default: + gcc_unreachable (); + } DONE; } } diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 93905583b18..c083f965fa9 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -6922,23 +6922,24 @@ ;;
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
On 06/11/2023 11:24, Richard Sandiford wrote: Stamatis Markianos-Wright writes: One of the main reasons for reading the arm bits was to try to answer the question: if we switch to a downcounting loop with a GE condition, how do we make sure that the start value is not a large unsigned number that is interpreted as negative by GE? E.g. if the loop originally counted up in steps of N and used an LTU condition, it could stop at a value in the range [INT_MAX + 1, UINT_MAX]. But the loop might never iterate if we start counting down from most values in that range. Does the patch handle that? So AFAICT this is actually handled in the generic code in `doloop_valid_p`: This kind of loops fail because of they are "desc->infinite", then no loop-doloop conversion is attempted at all (even for standard dls/le loops) Thanks to that check I haven't been able to trigger anything like the behaviour you describe, do you think the doloop_valid_p checks are robust enough? The loops I was thinking of are provably not infinite though. E.g.: for (unsigned int i = 0; i < UINT_MAX - 100; ++i) ... is known to terminate. And doloop conversion is safe with the normal count-down-by-1 approach, so I don't think current code would need to reject it. I.e. a conversion to: unsigned int i = UINT_MAX - 101; do ... while (--i != ~0U); would be safe, but a conversion to: int i = UINT_MAX - 101; do ... while ((i -= step, i > 0)); wouldn't, because the loop body would only be executed once. I'm only going off the name "infinite" though :) It's possible that it has more connotations than that. Thanks, Richard Ack, yep, I see what you mean now, and yep, that kind of loop does indeed pass through doloop_valid_p Interestingly , in the v8-M Arm ARM this is done with: ``` boolean IsLastLowOverheadLoop(INSTR_EXEC_STATE_Type state) // This does not check whether a loop is currently active. // If the PE were in a loop, would this be the last one? 
return UInt(state.LoopCount) <= (1 << (4 - LTPSIZE)); ``` So architecturally the asm we output would be ok (except maybe the "branch too far subs;bgt;lctp" fallback at `predicated_doloop_end_internal` (maybe that should be `bhi`))... But now GE: isn't looking like an accurate representation of this operation in the compiler. I'm wondering if I should try to make `predicated_doloop_end_internal` contain a comparison along the lines of: (gtu: (plus: (LR) (const_int -num_lanes)) (const_int num_lanes_minus_1)) I'll give that a try :) The only reason I'd chosen to go with GE earlier, tbh, was because of the existing handling of GE in loop-doloop.cc Let me know if any other ideas come to your mind! Cheers, Stam
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Stamatis Markianos-Wright writes: >> One of the main reasons for reading the arm bits was to try to answer >> the question: if we switch to a downcounting loop with a GE condition, >> how do we make sure that the start value is not a large unsigned >> number that is interpreted as negative by GE? E.g. if the loop >> originally counted up in steps of N and used an LTU condition, >> it could stop at a value in the range [INT_MAX + 1, UINT_MAX]. >> But the loop might never iterate if we start counting down from >> most values in that range. >> >> Does the patch handle that? > > So AFAICT this is actually handled in the generic code in `doloop_valid_p`: > > This kind of loops fail because of they are "desc->infinite", then no > loop-doloop conversion is attempted at all (even for standard dls/le loops) > > Thanks to that check I haven't been able to trigger anything like the > behaviour you describe, do you think the doloop_valid_p checks are > robust enough? The loops I was thinking of are provably not infinite though. E.g.: for (unsigned int i = 0; i < UINT_MAX - 100; ++i) ... is known to terminate. And doloop conversion is safe with the normal count-down-by-1 approach, so I don't think current code would need to reject it. I.e. a conversion to: unsigned int i = UINT_MAX - 101; do ... while (--i != ~0U); would be safe, but a conversion to: int i = UINT_MAX - 101; do ... while ((i -= step, i > 0)); wouldn't, because the loop body would only be executed once. I'm only going off the name "infinite" though :) It's possible that it has more connotations than that. Thanks, Richard
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Sorry for the slow review. I had a look at the arm bits too, to get some context for the target-independent bits. Stamatis Markianos-Wright via Gcc-patches writes: > [...] > diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h > index 77e76336e94..74186930f0b 100644 > --- a/gcc/config/arm/arm-protos.h > +++ b/gcc/config/arm/arm-protos.h > @@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void); > extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); > extern bool arm_q_bit_access (void); > extern bool arm_ge_bits_access (void); > -extern bool arm_target_insn_ok_for_lob (rtx); > - > +extern bool arm_target_bb_ok_for_lob (basic_block); > +extern rtx arm_attempt_dlstp_transform (rtx); > #ifdef RTX_CODE > enum reg_class > arm_mode_base_reg_class (machine_mode); > diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc > index 6e933c80183..39d97ba5e4d 100644 > --- a/gcc/config/arm/arm.cc > +++ b/gcc/config/arm/arm.cc > @@ -659,6 +659,12 @@ static const struct attribute_spec arm_attribute_table[] > [...] > +/* Wrapper function of arm_get_required_vpr_reg with TYPE == 1, so return > + something only if the VPR reg is an input operand to the insn. */ > + > +static rtx > +ALWAYS_INLINE Probably best to leave out the ALWAYS_INLINE. That's generally only appropriate for things that need to be inlined for correctness. > +arm_get_required_vpr_reg_param (rtx_insn *insn) > +{ > + return arm_get_required_vpr_reg (insn, 1); > +} > [...] > +/* Recursively scan through the DF chain backwards within the basic block and > + determine if any of the USEs of the original insn (or the USEs of the > insns > + where thy were DEF-ed, etc., recursively) were affected by implicit VPT > + predication of an MVE_VPT_UNPREDICATED_INSN_P in a dlstp/letp loop. > + This function returns true if the insn is affected implicit predication > + and false otherwise. 
> + Having such implicit predication on an unpredicated insn wouldn't in > itself > + block tail predication, because the output of that insn might then be used > + in a correctly predicated store insn, where the disabled lanes will be > + ignored. To verify this we later call: > + `arm_mve_check_df_chain_fwd_for_implic_predic_impact`, which will check > the > + DF chains forward to see if any implicitly-predicated operand gets used in > + an improper way. */ > + > +static bool > +arm_mve_check_df_chain_back_for_implic_predic > + (hash_map, bool>* safe_insn_map, rtx_insn *insn, > + rtx vctp_vpr_generated) > +{ > + bool* temp = NULL; > + if ((temp = safe_insn_map->get (INSN_UID (insn > +return *temp; > + > + basic_block body = BLOCK_FOR_INSN (insn); > + /* The circumstances under which an instruction is affected by "implicit > + predication" are as follows: > + * It is an UNPREDICATED_INSN_P: > + * That loads/stores from/to memory. > + * Where any one of its operands is an MVE vector from outside the > + loop body bb. > + Or: > + * Any of it's operands, recursively backwards, are affected. */ > + if (MVE_VPT_UNPREDICATED_INSN_P (insn) > + && (arm_is_mve_load_store_insn (insn) > + || (arm_is_mve_across_vector_insn (insn) > + && !arm_mve_is_allowed_unpredic_across_vector_insn (insn > +{ > + safe_insn_map->put (INSN_UID (insn), true); > + return true; > +} > + > + df_ref insn_uses = NULL; > + FOR_EACH_INSN_USE (insn_uses, insn) > + { > +/* If the operand is in the input reg set to the the basic block, > + (i.e. it has come from outside the loop!), consider it unsafe if: > + * It's being used in an unpredicated insn. > + * It is a predicable MVE vector. 
*/ > +if (MVE_VPT_UNPREDICATED_INSN_P (insn) > + && VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_uses))) > + && REGNO_REG_SET_P (DF_LR_IN (body), DF_REF_REGNO (insn_uses))) > + { > + safe_insn_map->put (INSN_UID (insn), true); > + return true; > + } > +/* Scan backwards from the current INSN through the instruction chain > + until the start of the basic block. */ > +for (rtx_insn *prev_insn = PREV_INSN (insn); > + prev_insn && prev_insn != PREV_INSN (BB_HEAD (body)); > + prev_insn = PREV_INSN (prev_insn)) > + { > + /* If a previous insn defines a register that INSN uses, then recurse > +in order to check that insn's USEs. > +If any of these insns return true as MVE_VPT_UNPREDICATED_INSN_Ps, > +then the whole chain is affected by the change in behaviour from > +being placed in dlstp/letp loop. */ > + df_ref prev_insn_defs = NULL; > + FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn) > + { > + if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs) > + && !arm_mve_vec_insn_is_predicated_with_this_predicate > +(insn, vctp_vpr_generated) > + &&
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Ping for Jeff or another global maintainer to review the target agnostic bits of this, that's: loop-doloop.cc df-core.{c,h} I do have a nitpick myself that I missed last time around: /* We expect the condition to be of the form (reg != 0) */ cond = XEXP (SET_SRC (cmp), 0); - if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx) + if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE) + || XEXP (cond, 1) != const0_rtx) return 0; } Could do with updating the comment to reflect allowing >= now. But happy for you to change this once approved by a maintainer. Kind regards, Andre On 11/10/2023 12:34, Stamatis Markianos-Wright wrote: Hi all, On 28/09/2023 13:51, Andre Vieira (lists) wrote: Hi, On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote: Hi Stam, The arm parts look sensible but we'd need review for the df-core.h and df-core.cc changes. Maybe Jeff can help or can recommend someone to take a look? Just thought I'd do a follow-up "ping" on this :) Thanks, Kyrill FWIW the changes LGTM, if we don't want these in df-core we can always implement the extra utility locally. It's really just a helper function to check if df_bb_regno_first_def_find and df_bb_regno_last_def_find yield the same result, meaning we only have a single definition. Kind regards, Andre Thanks, Stam
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Hi all, On 28/09/2023 13:51, Andre Vieira (lists) wrote: Hi, On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote: Hi Stam, The arm parts look sensible but we'd need review for the df-core.h and df-core.cc changes. Maybe Jeff can help or can recommend someone to take a look? Just thought I'd do a follow-up "ping" on this :) Thanks, Kyrill FWIW the changes LGTM, if we don't want these in df-core we can always implement the extra utility locally. It's really just a helper function to check if df_bb_regno_first_def_find and df_bb_regno_last_def_find yield the same result, meaning we only have a single definition. Kind regards, Andre Thanks, Stam
Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Hi, On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote: Hi Stam, The arm parts look sensible but we'd need review for the df-core.h and df-core.cc changes. Maybe Jeff can help or can recommend someone to take a look? Thanks, Kyrill FWIW the changes LGTM, if we don't want these in df-core we can always implement the extra utility locally. It's really just a helper function to check if df_bb_regno_first_def_find and df_bb_regno_last_def_find yield the same result, meaning we only have a single definition. Kind regards, Andre
RE: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Hi Stam, > -Original Message- > From: Stam Markianos-Wright > Sent: Wednesday, September 6, 2023 6:19 PM > To: gcc-patches@gcc.gnu.org > Cc: Kyrylo Tkachov ; Richard Earnshaw > > Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low > Overhead Loops > > Hi all, > > This is the 2/2 patch that contains the functional changes needed > for MVE Tail Predicated Low Overhead Loops. See my previous email > for a general introduction of MVE LOLs. > > This support is added through the already existing loop-doloop > mechanisms that are used for non-MVE dls/le looping. > > Mid-end changes are: > > 1) Relax the loop-doloop mechanism in the mid-end to allow for > decrement numbers other that -1 and for `count` to be an > rtx containing a simple REG (which in this case will contain > the number of elements to be processed), rather > than an expression for calculating the number of iterations. > 2) Added a new df utility function: `df_bb_regno_only_def_find` that > will return the DEF of a REG if it is DEF-ed only once within the > basic block. > > And many things in the backend to implement the above optimisation: > > 3) Implement the `arm_predict_doloop_p` target hook to instruct the > mid-end about Low Overhead Loops (MVE or not), as well as > `arm_loop_unroll_adjust` which will prevent unrolling of any loops > that are valid for becoming MVE Tail_Predicated Low Overhead Loops > (unrolling can transform a loop in ways that invalidate the dlstp/ > letp tranformation logic and the benefit of the dlstp/letp loop > would be considerably higher than that of unrolling) > 4) Appropriate changes to the define_expand of doloop_end, new > patterns for dlstp and letp, new iterators, unspecs, etc. 
> 5) `arm_mve_loop_valid_for_dlstp` and a number of checking functions: > * `arm_mve_dlstp_check_dec_counter` > * `arm_mve_dlstp_check_inc_counter` > * `arm_mve_check_reg_origin_is_num_elems` > * `arm_mve_check_df_chain_back_for_implic_predic` > * `arm_mve_check_df_chain_fwd_for_implic_predic_impact` > This all, in some way or another, are running checks on the loop > structure in order to determine if the loop is valid for dlstp/letp > transformation. > 6) `arm_attempt_dlstp_transform`: (called from the define_expand of > doloop_end) this function re-checks for the loop's suitability for > dlstp/letp transformation and then implements it, if possible. > 7) Various utility functions: > *`arm_mve_get_vctp_lanes` to map > from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg` > to check an insn to see if it requires the VPR or not. > * `arm_mve_get_loop_vctp` > * `arm_mve_get_vctp_lanes` > * `arm_emit_mve_unpredicated_insn_to_seq` > * `arm_get_required_vpr_reg` > * `arm_get_required_vpr_reg_param` > * `arm_get_required_vpr_reg_ret_val` > * `arm_mve_is_across_vector_insn` > * `arm_is_mve_load_store_insn` > * `arm_mve_vec_insn_is_predicated_with_this_predicate` > * `arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate` > > No regressions on arm-none-eabi with various targets and on > aarch64-none-elf. Thoughts on getting this into trunk? The arm parts look sensible but we'd need review for the df-core.h and df-core.cc changes. Maybe Jeff can help or can recommend someone to take a look? Thanks, Kyrill > > Thank you, > Stam Markianos-Wright > > gcc/ChangeLog: > > * config/arm/arm-protos.h (arm_target_insn_ok_for_lob): Rename to... > (arm_target_bb_ok_for_lob): ...this > (arm_attempt_dlstp_transform): New. > * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): New. > (TARGET_PREDICT_DOLOOP_P): New. > (arm_block_set_vect): > (arm_target_insn_ok_for_lob): Rename from arm_target_insn_ok_for_lob. > (arm_target_bb_ok_for_lob): New. 
> (arm_mve_get_vctp_lanes): New. > (arm_get_required_vpr_reg): New. > (arm_get_required_vpr_reg_param): New. > (arm_get_required_vpr_reg_ret_val): New. > (arm_mve_get_loop_vctp): New. > (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate): New. > (arm_mve_vec_insn_is_predicated_with_this_predicate): New. > (arm_mve_check_df_chain_back_for_implic_predic): New. > (arm_mve_check_df_chain_fwd_for_implic_predic_impact): New. > (arm_mve_check_reg_origin_is_num_elems): New. > (arm_mve_dlstp_check_inc_counter): New. > (arm_mve_dlstp_check_dec_counter): New. > (arm_mve_loop_valid_for_dlstp): New. > (arm_mve_is_across_vector_insn): New. > (arm_is_mve_load_store_insn): New. > (arm_predict_doloop_p): Ne
Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
On 23/06/2023 11:23, Andre Vieira (lists) wrote: + if (insn != arm_mve_get_loop_vctp (body)) + { probably a good idea to invert the condition here and return false, helps reducing the indenting in this function. Done, thanks + /* Starting from the current insn, scan backwards through the insn + chain until BB_HEAD: "for each insn in the BB prior to the current". + */ There's a trailing whitespace after insn, but also I'd rewrite this bit. The "for each insn in the BB prior to the current" is superfluous and even confusing to me. How about: "Scan backwards from the current INSN through the instruction chain until the start of the basic block. " Yes, agreed, it wasn't very clear. Done. I find 'that previous insn' to be confusing as you don't mention any previous insn before. So how about something along the lines of: 'If a previous insn defines a register that INSN uses then return true if...' Done Do we need to check: 'insn != prev_insn' ? Any reason why you can't start the loop with: 'for (rtx_insn *prev_insn = PREV_INSN (insn);' True! Done. Now I also found a case where things might go wrong in: + /* Look at all the DEFs of that previous insn: if one of them is on + the same REG as our current insn, then recurse in order to check + that insn's USEs. If any of these insns return true as + MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is affected + by the change in behaviour from being placed in dlstp/letp loop. 
+ */ + df_ref prev_insn_defs = NULL; + FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn) + { + if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs) + && insn != prev_insn + && body == BLOCK_FOR_INSN (prev_insn) + && !arm_mve_vec_insn_is_predicated_with_this_predicate + (insn, vctp_vpr_generated) + && arm_mve_check_df_chain_back_for_implic_predic + (prev_insn, vctp_vpr_generated)) + return true; + } The body == BLOCK_FOR_INSN (prev_insn) hinted me at it, if a def comes from outside of the BB (so outside of the loop's body) then its by definition unpredicated by vctp. I think you want to check that if prev_insn defines a register used by insn then return true if prev_insn isn't in the same BB or has a chain that is not predicated, i.e.: '!arm_mve_vec_insn_is_predicated_with_this_predicate (insn, vctp_vpr_generated) && arm_mve_check_df_chain_back_for_implic_predic prev_insn, vctp_vpr_generated))' you check body != BLOCK_FOR_INSN (prev_insn)' Yes, you're right, this is vulnerable here. A neater fix to this (I think?) is to make the above REGNO_REG_SET_P more generic, so that it covers all scalar values and scalar ops, as well. Then it's a "if this insn in the loop has any input that originates outside the bb, then it's unsafe" check and the recursive loop backwards is only for the recursive "are any previous insns unsafe" I also found some other issues, this currently loloops: uint16_t test (uint16_t *a, int n) { uint16_t res =0; while (n > 0) { mve_pred16_t p = vctp16q (n); uint16x8_t va = vldrhq_u16 (a); res = vaddvaq_u16 (res, va); res = vaddvaq_p_u16 (res, va, p); a += 8; n -= 8; } return res; } But it shouldn't, this is because there's a lack of handling of across vector instructions. Luckily in MVE all across vector instructions have the side-effect that they write to a scalar register, even the vshlcq instruction (it writes to a scalar carry output). Added support for them (you were right, there was some special handling needed!) 
Did this lead me to find an ICE with: uint16x8_t test (uint16_t *a, int n) { uint16x8_t res = vdupq_n_u16 (0); while (n > 0) { uint16_t carry = 0; mve_pred16_t p = vctp16q (n); uint16x8_t va = vldrhq_u16 (a); res = vshlcq_u16 (va, &carry, 1); res = vshlcq_m_u16 (res, &carry, 1, p); a += 8; n -= 8; } return res; } This is because: + /* If the USE is outside the loop body bb, or it is inside, but + is an unpredicated store to memory. */ + if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn) + || (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate + (next_use_insn, vctp_vpr_generated) + && mve_memory_operand + (SET_DEST (single_set (next_use_insn)), + GET_MODE (SET_DEST (single_set (next_use_insn)) + return true; Assumes single_set doesn't return 0. Thanks! That is indeed correct. Corrected this by having a utility function to scan insn operands and check against mve_memory_operand that supports any number of operands/SETs in the insn Let's deal with these issues and I'll continue to review. On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote: Hi all, This is
Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Thank you Andre for reviewing! I'll attach the updated version of the patch to the third review email (your final one thus far ;) On 22/06/2023 16:54, Andre Vieira (lists) wrote: Some comments below, all quite minor. I'll continue to review tomorrow, I need a fresher brain for arm_mve_check_df_chain_back_for_implic_predic ;) +static int +arm_mve_get_vctp_lanes (rtx x) +{ + if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC + && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == VCTP_M)) + { + switch (GET_MODE (XEXP (x, 1))) + { + case V16BImode: + return 16; + case V8BImode: + return 8; + case V4BImode: + return 4; + case V2QImode: + return 2; + default: + break; + } + } + return 0; +} I think you can replace the switch with something along the lines of: machine_mode mode = GET_MODE (XEXP (x, 1)); return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 0; Ah true, especially now that there are no HImode predicates! I added an additional check of `&& VALID_MVE_PRED_MODE (mode)` as well, just to make sure we could never pick up V4SImode, etc. (although I'd never expect that to happen if `rtx x` came from a valid instruction) +/* Check if an insn requires the use of the VPR_REG, if it does, return the + sub-rtx of the VPR_REG. The `type` argument controls whether + this function should: + * For type == 0, check all operands, including the OUT operands, + and return the first occurance of the VPR_REG. s/occurance/occurrence/ Done + bool requires_vpr; + extract_constrain_insn (insn); indent of requires_vpr is off. Done + if (type == 1 && (recog_data.operand_type[op] == OP_OUT + || recog_data.operand_type[op] == OP_INOUT)) + continue; + else if (type == 2 && (recog_data.operand_type[op] == OP_IN + || recog_data.operand_type[op] == OP_INOUT)) + continue; Why skip INOUT? I guess this will become clear when I see the uses, but I'm wondering whether 'only check the input operands.' is clear enough. Maybe 'check operands that are input only.' 
would be more accurate? Oh! Thanks for spotting this. It also doesn't work with my comment at the top: `(INOUT operands are considered both as input and output operands)` It's been a long time since I wrote this piece, but it might be that I added this after realising that there are no insns with an OP_INOUT VPR reg. Since I don't think it's functional, I changed the code to align with the comment, instead. + /* Fetch the reg_class for each entry and check it against the + * VPR_REG reg_class. */ Remove leading * on the second line. Damn auto-formatters ;) Done + +/* Wrapper function of arm_get_required_vpr_reg with type == 1, so return + something only if the VPR reg is an input operand to the insn. */ When talking about a function parameter in comments capitalize (INSN) the name. Same for: Done +/* Wrapper function of arm_get_required_vpr_reg with type == 2, so return + something only if the VPR reg is the return value, an output of, or is + clobbered by the insn. */ +/* Return true if an insn is an MVE instruction that VPT-predicable, but in + its unpredicated form, or if it is predicated, but on a predicate other + than vpr_reg. */ In this one also 'is a MVE instruction that is VPT-predicable' would be better I think. Oops, thanks for spotting. Done. On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote: > Hi all, > > This is the 2/2 patch that contains the functional changes needed > for MVE Tail Predicated Low Overhead Loops. See my previous email > for a general introduction of MVE LOLs. > > This support is added through the already existing loop-doloop > mechanisms that are used for non-MVE dls/le looping. > > Mid-end changes are: > > 1) Relax the loop-doloop mechanism in the mid-end to allow for > decrement numbers other that -1 and for `count` to be an > rtx containing a simple REG (which in this case will contain > the number of elements to be processed), rather > than an expression for calculating the number of iterations. 
> 2) Added a new df utility function: `df_bb_regno_only_def_find` that > will return the DEF of a REG only if it is DEF-ed once within the > basic block. > > And many things in the backend to implement the above optimisation: > > 3) Implement the `arm_predict_doloop_p` target hook to instruct the > mid-end about Low Overhead Loops (MVE or not), as well as > `arm_loop_unroll_adjust` which will prevent unrolling of any loops > that are valid for becoming MVE Tail_Predicated Low Overhead Loops > (unrolling can transform a loop in ways that invalidate the dlstp/ > letp tranformation logic and the benefit of the dlstp/letp loop
Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
+ /* In order to find out if the loop is of type A or B above look for the + loop counter: it will either be incrementing by one per iteration or + it will be decrementing by num_of_lanes. We can find the loop counter + in the condition at the end of the loop. */ + rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (body)); + gcc_assert (cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode) + && GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE); Not sure this should be an assert. If we do encounter a differently formed loop, we should bail out of DLSTPing for now but we shouldn't ICE. + /* The loop latch has to be empty. When compiling all the known MVE LoLs in + user applications, none of those with incrementing counters had any real + insns in the loop latch. As such, this function has only been tested with + an empty latch and may misbehave or ICE if we somehow get here with an + increment in the latch, so, for sanity, error out early. */ + rtx_insn *dec_insn = BB_END (body->loop_father->latch); + if (NONDEBUG_INSN_P (dec_insn)) +gcc_unreachable (); Similarly here I'd return false rather than gcc_unreachable (); + /* Find where both of those are modified in the loop body bb. */ + rtx condcount_reg_set = PATTERN (DF_REF_INSN (df_bb_regno_only_def_find +(body, REGNO (condcount; Put = on newline, breaks it down nicer. + counter_orig_set = XEXP (PATTERN + (DF_REF_INSN + (DF_REF_NEXT_REG + (DF_REG_DEF_CHAIN +(REGNO + (XEXP (condcount_reg_set, 0)), 1); This makes me a bit nervous, can we be certain that the PATTERN of the next insn that sets it is indeed a set. Heck can we even be sure DF_REG_DEF_CHAIN returns a non-null, I can't imagine why not but maybe there are some constructs it can't follow-up on? Might just be worth checking these steps and bailing out. + /* When we find the vctp instruction: This may be followed by + a zero-extend insn to SImode. If it is, then save the + zero-extended REG into vctp_vpr_generated. 
If there is no + zero-extend, then store the raw output of the vctp. + For any VPT-predicated instructions we need to ensure that + the VPR they use is the same as the one given here and + they often consume the output of a subreg of the SImode + zero-extended VPR-reg. As a result, comparing against the + output of the zero-extend is more likely to succeed. + This code also guarantees to us that the vctp comes before + any instructions that use the VPR within the loop, for the + dlstp/letp transform to succeed. */ Wrong comment indent after first line. + rtx_insn *vctp_insn = arm_mve_get_loop_vctp (body); + if (!vctp_insn || !arm_mve_loop_valid_for_dlstp (body)) +return GEN_INT (1); arm_mve_loop_valid_for_dlstp already calls arm_mve_get_loop_vctp, maybe have 'arm_mve_loop_valid_for_dlstp' return vctp_insn or null to determine success or failure, avoids looping through the BB again. For the same reason I'd also pass vctp_insn down to 'arm_mve_check_df_chain_back_for_implic_predic'. + if (GET_CODE (SET_SRC (single_set (next_use1))) == ZERO_EXTEND) + { + rtx_insn *next_use2 = NULL; Are we sure single_set can never return 0 here? Maybe worth an extra check and bail out if it does? + /* If the insn pattern requires the use of the VPR value from the + vctp as an input parameter. */ s/an an input parameter./as an input parameter for predication./ + /* None of registers USE-d by the instruction need can be the VPR +vctp_vpr_generated. This blocks the optimisation if there any +instructions that use the optimised-out VPR value in any way +other than as a VPT block predicate. */ Reword this slightly to be less complex: This instruction USE-s the vctp_vpr_generated other than for predication, this blocks the transformation as we are not allowed to optimise the VPR value away. 
Will continue reviewing next week :) On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote: Hi all, This is the 2/2 patch that contains the functional changes needed for MVE Tail Predicated Low Overhead Loops. See my previous email for a general introduction of MVE LOLs. This support is added through the already existing loop-doloop mechanisms that are used for non-MVE dls/le looping. Mid-end changes are: 1) Relax the loop-doloop mechanism in the mid-end to allow for decrement numbers other that -1 and for `count` to be an rtx containing a simple REG (which in this case will contain the number of elements to be processed),
Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
+ if (insn != arm_mve_get_loop_vctp (body)) +{ probably a good idea to invert the condition here and return false, helps reducing the indenting in this function. + /* Starting from the current insn, scan backwards through the insn + chain until BB_HEAD: "for each insn in the BB prior to the current". + */ There's a trailing whitespace after insn, but also I'd rewrite this bit. The "for each insn in the BB prior to the current" is superfluous and even confusing to me. How about: "Scan backwards from the current INSN through the instruction chain until the start of the basic block. " I find 'that previous insn' to be confusing as you don't mention any previous insn before. So how about something along the lines of: 'If a previous insn defines a register that INSN uses then return true if...' Do we need to check: 'insn != prev_insn' ? Any reason why you can't start the loop with: 'for (rtx_insn *prev_insn = PREV_INSN (insn);' Now I also found a case where things might go wrong in: + /* Look at all the DEFs of that previous insn: if one of them is on + the same REG as our current insn, then recurse in order to check + that insn's USEs. If any of these insns return true as + MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is affected + by the change in behaviour from being placed in dlstp/letp loop. + */ + df_ref prev_insn_defs = NULL; + FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn) + { + if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs) + && insn != prev_insn + && body == BLOCK_FOR_INSN (prev_insn) + && !arm_mve_vec_insn_is_predicated_with_this_predicate +(insn, vctp_vpr_generated) + && arm_mve_check_df_chain_back_for_implic_predic +(prev_insn, vctp_vpr_generated)) + return true; + } The body == BLOCK_FOR_INSN (prev_insn) hinted me at it, if a def comes from outside of the BB (so outside of the loop's body) then its by definition unpredicated by vctp. 
I think you want to check that if prev_insn defines a register used by insn then return true if prev_insn isn't in the same BB or has a chain that is not predicated, i.e.: '!arm_mve_vec_insn_is_predicated_with_this_predicate (insn, vctp_vpr_generated) && arm_mve_check_df_chain_back_for_implic_predic prev_insn, vctp_vpr_generated))' you check body != BLOCK_FOR_INSN (prev_insn)' I also found some other issues, this currently loloops: uint16_t test (uint16_t *a, int n) { uint16_t res = 0; while (n > 0) { mve_pred16_t p = vctp16q (n); uint16x8_t va = vldrhq_u16 (a); res = vaddvaq_u16 (res, va); res = vaddvaq_p_u16 (res, va, p); a += 8; n -= 8; } return res; } But it shouldn't, this is because there's a lack of handling of across vector instructions. Luckily in MVE all across vector instructions have the side-effect that they write to a scalar register, even the vshlcq instruction (it writes to a scalar carry output). Doing this led me to find an ICE with: uint16x8_t test (uint16_t *a, int n) { uint16x8_t res = vdupq_n_u16 (0); while (n > 0) { uint16_t carry = 0; mve_pred16_t p = vctp16q (n); uint16x8_t va = vldrhq_u16 (a); res = vshlcq_u16 (va, &carry, 1); res = vshlcq_m_u16 (res, &carry, 1, p); a += 8; n -= 8; } return res; } This is because: + /* If the USE is outside the loop body bb, or it is inside, but +is an unpredicated store to memory. */ + if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn) +|| (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate +(next_use_insn, vctp_vpr_generated) + && mve_memory_operand + (SET_DEST (single_set (next_use_insn)), +GET_MODE (SET_DEST (single_set (next_use_insn)) + return true; Assumes single_set doesn't return 0. Let's deal with these issues and I'll continue to review. On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote: Hi all, This is the 2/2 patch that contains the functional changes needed for MVE Tail Predicated Low Overhead Loops. See my previous email for a general introduction of MVE LOLs. 
This support is added through the already existing loop-doloop mechanisms that are used for non-MVE dls/le looping. Mid-end changes are: 1) Relax the loop-doloop mechanism in the mid-end to allow for decrement numbers other that -1 and for `count` to be an rtx containing a simple REG (which in this case will contain the number of elements to be processed), rather than an expression for calculating the number of iterations.
Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Some comments below, all quite minor. I'll continue to review tomorrow, I need a fresher brain for arm_mve_check_df_chain_back_for_implic_predic ;) +static int +arm_mve_get_vctp_lanes (rtx x) +{ + if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC + && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == VCTP_M)) +{ + switch (GET_MODE (XEXP (x, 1))) + { + case V16BImode: + return 16; + case V8BImode: + return 8; + case V4BImode: + return 4; + case V2QImode: + return 2; + default: + break; + } +} + return 0; +} I think you can replace the switch with something along the lines of: machine_mode mode = GET_MODE (XEXP (x, 1)); return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 0; +/* Check if an insn requires the use of the VPR_REG, if it does, return the + sub-rtx of the VPR_REG. The `type` argument controls whether + this function should: + * For type == 0, check all operands, including the OUT operands, + and return the first occurance of the VPR_REG. s/occurance/occurrence/ + bool requires_vpr; + extract_constrain_insn (insn); indent of requires_vpr is off. + if (type == 1 && (recog_data.operand_type[op] == OP_OUT + || recog_data.operand_type[op] == OP_INOUT)) + continue; + else if (type == 2 && (recog_data.operand_type[op] == OP_IN +|| recog_data.operand_type[op] == OP_INOUT)) + continue; Why skip INOUT? I guess this will become clear when I see the uses, but I'm wondering whether 'only check the input operands.' is clear enough. Maybe 'check operands that are input only.' would be more accurate? + /* Fetch the reg_class for each entry and check it against the + * VPR_REG reg_class. */ Remove leading * on the second line. + +/* Wrapper function of arm_get_required_vpr_reg with type == 1, so return + something only if the VPR reg is an input operand to the insn. */ When talking about a function parameter in comments capitalize (INSN) the name. 
Same for: +/* Wrapper function of arm_get_required_vpr_reg with type == 2, so return + something only if the VPR reg is the retrurn value, an output of, or is + clobbered by the insn. */ +/* Return true if an insn is an MVE instruction that VPT-predicable, but in + its unpredicated form, or if it is predicated, but on a predicate other + than vpr_reg. */ In this one also 'is a MVE instruction that is VPT-predicable' would be better I think. On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote: > Hi all, > > This is the 2/2 patch that contains the functional changes needed > for MVE Tail Predicated Low Overhead Loops. See my previous email > for a general introduction of MVE LOLs. > > This support is added through the already existing loop-doloop > mechanisms that are used for non-MVE dls/le looping. > > Mid-end changes are: > > 1) Relax the loop-doloop mechanism in the mid-end to allow for > decrement numbers other that -1 and for `count` to be an > rtx containing a simple REG (which in this case will contain > the number of elements to be processed), rather > than an expression for calculating the number of iterations. > 2) Added a new df utility function: `df_bb_regno_only_def_find` that > will return the DEF of a REG only if it is DEF-ed once within the > basic block. > > And many things in the backend to implement the above optimisation: > > 3) Implement the `arm_predict_doloop_p` target hook to instruct the > mid-end about Low Overhead Loops (MVE or not), as well as > `arm_loop_unroll_adjust` which will prevent unrolling of any loops > that are valid for becoming MVE Tail_Predicated Low Overhead Loops > (unrolling can transform a loop in ways that invalidate the dlstp/ > letp tranformation logic and the benefit of the dlstp/letp loop > would be considerably higher than that of unrolling) > 4) Appropriate changes to the define_expand of doloop_end, new > patterns for dlstp and letp, new iterators, unspecs, etc. 
> 5) `arm_mve_loop_valid_for_dlstp` and a number of checking functions: > * `arm_mve_dlstp_check_dec_counter` > * `arm_mve_dlstp_check_inc_counter` > * `arm_mve_check_reg_origin_is_num_elems` > * `arm_mve_check_df_chain_back_for_implic_predic` > * `arm_mve_check_df_chain_fwd_for_implic_predic_impact` > This all, in some way or another, are running checks on the loop > structure in order to determine if the loop is valid for dlstp/letp > transformation. > 6) `arm_attempt_dlstp_transform`: (called from the define_expand of > doloop_end) this function re-checks for the loop's suitability for
[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Hi all, This is the 2/2 patch that contains the functional changes needed for MVE Tail Predicated Low Overhead Loops. See my previous email for a general introduction of MVE LOLs. This support is added through the already existing loop-doloop mechanisms that are used for non-MVE dls/le looping. Mid-end changes are: 1) Relax the loop-doloop mechanism in the mid-end to allow for decrement numbers other that -1 and for `count` to be an rtx containing a simple REG (which in this case will contain the number of elements to be processed), rather than an expression for calculating the number of iterations. 2) Added a new df utility function: `df_bb_regno_only_def_find` that will return the DEF of a REG only if it is DEF-ed once within the basic block. And many things in the backend to implement the above optimisation: 3) Implement the `arm_predict_doloop_p` target hook to instruct the mid-end about Low Overhead Loops (MVE or not), as well as `arm_loop_unroll_adjust` which will prevent unrolling of any loops that are valid for becoming MVE Tail_Predicated Low Overhead Loops (unrolling can transform a loop in ways that invalidate the dlstp/ letp tranformation logic and the benefit of the dlstp/letp loop would be considerably higher than that of unrolling) 4) Appropriate changes to the define_expand of doloop_end, new patterns for dlstp and letp, new iterators, unspecs, etc. 5) `arm_mve_loop_valid_for_dlstp` and a number of checking functions: * `arm_mve_dlstp_check_dec_counter` * `arm_mve_dlstp_check_inc_counter` * `arm_mve_check_reg_origin_is_num_elems` * `arm_mve_check_df_chain_back_for_implic_predic` * `arm_mve_check_df_chain_fwd_for_implic_predic_impact` This all, in smoe way or another, are running checks on the loop structure in order to determine if the loop is valid for dlstp/letp transformation. 
6) `arm_attempt_dlstp_transform`: (called from the define_expand of doloop_end) this function re-checks for the loop's suitability for dlstp/letp transformation and then implements it, if possible. 7) Various utility functions: *`arm_mve_get_vctp_lanes` to map from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg` to check an insn to see if it requires the VPR or not. * `arm_mve_get_loop_vctp` * `arm_mve_get_vctp_lanes` * `arm_emit_mve_unpredicated_insn_to_seq` * `arm_get_required_vpr_reg` * `arm_get_required_vpr_reg_param` * `arm_get_required_vpr_reg_ret_val` * `arm_mve_vec_insn_is_predicated_with_this_predicate` * `arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate` No regressions on arm-none-eabi with various targets and on aarch64-none-elf. Thoughts on getting this into trunk? Thank you, Stam Markianos-Wright gcc/ChangeLog: * config/arm/arm-protos.h (arm_target_insn_ok_for_lob): Rename to... (arm_target_bb_ok_for_lob): ...this (arm_attempt_dlstp_transform): New. * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): New. (TARGET_PREDICT_DOLOOP_P): New. (arm_block_set_vect): (arm_target_insn_ok_for_lob): Rename from arm_target_insn_ok_for_lob. (arm_target_bb_ok_for_lob): New. (arm_mve_get_vctp_lanes): New. (arm_get_required_vpr_reg): New. (arm_get_required_vpr_reg_param): New. (arm_get_required_vpr_reg_ret_val): New. (arm_mve_get_loop_vctp): New. (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate): New. (arm_mve_vec_insn_is_predicated_with_this_predicate): New. (arm_mve_check_df_chain_back_for_implic_predic): New. (arm_mve_check_df_chain_fwd_for_implic_predic_impact): New. (arm_mve_check_reg_origin_is_num_elems): New. (arm_mve_dlstp_check_inc_counter): New. (arm_mve_dlstp_check_dec_counter): New. (arm_mve_loop_valid_for_dlstp): New. (arm_predict_doloop_p): New. (arm_loop_unroll_adjust): New. (arm_emit_mve_unpredicated_insn_to_seq): New. (arm_attempt_dlstp_transform): New. * config/arm/iterators.md (DLSTP): New. (mode1): Add DLSTP mappings. 
* config/arm/mve.md (*predicated_doloop_end_internal): New. (dlstp_insn): New. * config/arm/thumb2.md (doloop_end): Update for MVE LOLs. * config/arm/unspecs.md: New unspecs. * df-core.cc (df_bb_regno_only_def_find): New. * df.h (df_bb_regno_only_def_find): New. * loop-doloop.cc (doloop_condition_get): Relax conditions. (doloop_optimize): Add support for elementwise LoLs. gcc/testsuite/ChangeLog: *
[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
On 11/15/22 15:51, Andre Vieira (lists) wrote: On 11/11/2022 17:40, Stam Markianos-Wright via Gcc-patches wrote: Hi all, This is the 2/2 patch that contains the functional changes needed for MVE Tail Predicated Low Overhead Loops. See my previous email for a general introduction of MVE LOLs. This support is added through the already existing loop-doloop mechanisms that are used for non-MVE dls/le looping. Changes are: 1) Relax the loop-doloop mechanism in the mid-end to allow for decrement numbers other than -1 and for `count` to be an rtx containing the number of elements to be processed, rather than an expression for calculating the number of iterations. 2) Add an `allow_elementwise_doloop` target hook. This allows the target backend to manipulate the iteration count as it needs: in our case to change it from a pre-calculation of the number of iterations to the number of elements to be processed. 3) The doloop_end target-insn now has an additional parameter: the `count` (note: this is before it gets modified to just be the number of elements), so that the decrement value is extracted from that parameter. And many things in the backend to implement the above optimisation: 4) Appropriate changes to the define_expand of doloop_end and new patterns for dlstp and letp. 5) `arm_attempt_dlstp_transform`: (called from the define_expand of doloop_end) this function checks for the loop's suitability for dlstp/letp transformation and then implements it, if possible. 6) `arm_mve_get_loop_unique_vctp`: A function that loops through the loop contents and returns the vctp VPR-generating operation within the loop, if it is unique and there is exclusively one vctp within the loop. 7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg` to check an insn to see if it requires the VPR or not. No regressions on arm-none-eabi with various targets and on aarch64-none-elf. Thoughts on getting this into trunk? 
Thank you, Stam Markianos-Wright gcc/ChangeLog: * config/aarch64/aarch64.md: Add extra doloop_end arg. * config/arm/arm-protos.h (arm_attempt_dlstp_transform): New. * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New. (arm_mve_get_vctp_lanes): New. (arm_get_required_vpr_reg): New. (arm_mve_get_loop_unique_vctp): New. (arm_attempt_dlstp_transform): New. (arm_allow_elementwise_doloop): New. * config/arm/iterators.md: * config/arm/mve.md (*predicated_doloop_end_internal): New. (dlstp_insn): New. * config/arm/thumb2.md (doloop_end): Update for MVE LOLs. * config/arm/unspecs.md: New unspecs. * config/ia64/ia64.md: Add extra doloop_end arg. * config/pru/pru.md: Add extra doloop_end arg. * config/rs6000/rs6000.md: Add extra doloop_end arg. * config/s390/s390.md: Add extra doloop_end arg. * config/v850/v850.md: Add extra doloop_end arg. * doc/tm.texi: Document new hook. * doc/tm.texi.in: Likewise. * loop-doloop.cc (doloop_condition_get): Relax conditions. (doloop_optimize): Add support for elementwise LoLs. * target-insns.def (doloop_end): Add extra arg. * target.def (allow_elementwise_doloop): New hook. * targhooks.cc (default_allow_elementwise_doloop): New. * targhooks.h (default_allow_elementwise_doloop): New. gcc/testsuite/ChangeLog: * gcc.target/arm/lob.h: Update framework. * gcc.target/arm/lob1.c: Likewise. * gcc.target/arm/lob6.c: Likewise. * gcc.target/arm/dlstp-int16x8.c: New test. * gcc.target/arm/dlstp-int32x4.c: New test. * gcc.target/arm/dlstp-int64x2.c: New test. * gcc.target/arm/dlstp-int8x16.c: New test. ### Inline copy of patch ### diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -7366,7 +7366,8 @@ ;; knows what to generate. 
(define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "optimize > 0 && flag_modulo_sched" { rtx s0; diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); extern bool arm_target_insn_ok_for_lob (rtx); - +extern rtx
Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
On 11/11/2022 17:40, Stam Markianos-Wright via Gcc-patches wrote: Hi all, This is the 2/2 patch that contains the functional changes needed for MVE Tail Predicated Low Overhead Loops. See my previous email for a general introduction of MVE LOLs. This support is added through the already existing loop-doloop mechanisms that are used for non-MVE dls/le looping. Changes are: 1) Relax the loop-doloop mechanism in the mid-end to allow for decrement numbers other that -1 and for `count` to be an rtx containing the number of elements to be processed, rather than an expression for calculating the number of iterations. 2) Add a `allow_elementwise_doloop` target hook. This allows the target backend to manipulate the iteration count as it needs: in our case to change it from a pre-calculation of the number of iterations to the number of elements to be processed. 3) The doloop_end target-insn now had an additional parameter: the `count` (note: this is before it gets modified to just be the number of elements), so that the decrement value is extracted from that parameter. And many things in the backend to implement the above optimisation: 4) Appropriate changes to the define_expand of doloop_end and new patterns for dlstp and letp. 5) `arm_attempt_dlstp_transform`: (called from the define_expand of doloop_end) this function checks for the loop's suitability for dlstp/letp transformation and then implements it, if possible. 6) `arm_mve_get_loop_unique_vctp`: A function that loops through the loop contents and returns the vctp VPR-genereting operation within the loop, if it is unique and there is exclusively one vctp within the loop. 7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg` to check an insn to see if it requires the VPR or not. No regressions on arm-none-eabi with various targets and on aarch64-none-elf. Thoughts on getting this into trunk? 
Thank you, Stam Markianos-Wright gcc/ChangeLog: * config/aarch64/aarch64.md: Add extra doloop_end arg. * config/arm/arm-protos.h (arm_attempt_dlstp_transform): New. * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New. (arm_mve_get_vctp_lanes): New. (arm_get_required_vpr_reg): New. (arm_mve_get_loop_unique_vctp): New. (arm_attempt_dlstp_transform): New. (arm_allow_elementwise_doloop): New. * config/arm/iterators.md: * config/arm/mve.md (*predicated_doloop_end_internal): New. (dlstp_insn): New. * config/arm/thumb2.md (doloop_end): Update for MVE LOLs. * config/arm/unspecs.md: New unspecs. * config/ia64/ia64.md: Add extra doloop_end arg. * config/pru/pru.md: Add extra doloop_end arg. * config/rs6000/rs6000.md: Add extra doloop_end arg. * config/s390/s390.md: Add extra doloop_end arg. * config/v850/v850.md: Add extra doloop_end arg. * doc/tm.texi: Document new hook. * doc/tm.texi.in: Likewise. * loop-doloop.cc (doloop_condition_get): Relax conditions. (doloop_optimize): Add support for elementwise LoLs. * target-insns.def (doloop_end): Add extra arg. * target.def (allow_elementwise_doloop): New hook. * targhooks.cc (default_allow_elementwise_doloop): New. * targhooks.h (default_allow_elementwise_doloop): New. gcc/testsuite/ChangeLog: * gcc.target/arm/lob.h: Update framework. * gcc.target/arm/lob1.c: Likewise. * gcc.target/arm/lob6.c: Likewise. * gcc.target/arm/dlstp-int16x8.c: New test. * gcc.target/arm/dlstp-int32x4.c: New test. * gcc.target/arm/dlstp-int64x2.c: New test. * gcc.target/arm/dlstp-int8x16.c: New test. ### Inline copy of patch ### diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -7366,7 +7366,8 @@ ;; knows what to generate. 
(define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "optimize > 0 && flag_modulo_sched" { rtx s0; diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); extern bool arm_target_insn_ok_for_lob (rtx); - +extern rtx arm_attempt_dlstp_transform (rtx, rtx); #ifdef RTX_CODE enum reg_class
[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops
Hi all, This is the 2/2 patch that contains the functional changes needed for MVE Tail Predicated Low Overhead Loops. See my previous email for a general introduction of MVE LOLs. This support is added through the already existing loop-doloop mechanisms that are used for non-MVE dls/le looping. Changes are: 1) Relax the loop-doloop mechanism in the mid-end to allow for decrement numbers other that -1 and for `count` to be an rtx containing the number of elements to be processed, rather than an expression for calculating the number of iterations. 2) Add a `allow_elementwise_doloop` target hook. This allows the target backend to manipulate the iteration count as it needs: in our case to change it from a pre-calculation of the number of iterations to the number of elements to be processed. 3) The doloop_end target-insn now had an additional parameter: the `count` (note: this is before it gets modified to just be the number of elements), so that the decrement value is extracted from that parameter. And many things in the backend to implement the above optimisation: 4) Appropriate changes to the define_expand of doloop_end and new patterns for dlstp and letp. 5) `arm_attempt_dlstp_transform`: (called from the define_expand of doloop_end) this function checks for the loop's suitability for dlstp/letp transformation and then implements it, if possible. 6) `arm_mve_get_loop_unique_vctp`: A function that loops through the loop contents and returns the vctp VPR-genereting operation within the loop, if it is unique and there is exclusively one vctp within the loop. 7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg` to check an insn to see if it requires the VPR or not. No regressions on arm-none-eabi with various targets and on aarch64-none-elf. Thoughts on getting this into trunk? Thank you, Stam Markianos-Wright gcc/ChangeLog: * config/aarch64/aarch64.md: Add extra doloop_end arg. 
* config/arm/arm-protos.h (arm_attempt_dlstp_transform): New. * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New. (arm_mve_get_vctp_lanes): New. (arm_get_required_vpr_reg): New. (arm_mve_get_loop_unique_vctp): New. (arm_attempt_dlstp_transform): New. (arm_allow_elementwise_doloop): New. * config/arm/iterators.md: * config/arm/mve.md (*predicated_doloop_end_internal): New. (dlstp_insn): New. * config/arm/thumb2.md (doloop_end): Update for MVE LOLs. * config/arm/unspecs.md: New unspecs. * config/ia64/ia64.md: Add extra doloop_end arg. * config/pru/pru.md: Add extra doloop_end arg. * config/rs6000/rs6000.md: Add extra doloop_end arg. * config/s390/s390.md: Add extra doloop_end arg. * config/v850/v850.md: Add extra doloop_end arg. * doc/tm.texi: Document new hook. * doc/tm.texi.in: Likewise. * loop-doloop.cc (doloop_condition_get): Relax conditions. (doloop_optimize): Add support for elementwise LoLs. * target-insns.def (doloop_end): Add extra arg. * target.def (allow_elementwise_doloop): New hook. * targhooks.cc (default_allow_elementwise_doloop): New. * targhooks.h (default_allow_elementwise_doloop): New. gcc/testsuite/ChangeLog: * gcc.target/arm/lob.h: Update framework. * gcc.target/arm/lob1.c: Likewise. * gcc.target/arm/lob6.c: Likewise. * gcc.target/arm/dlstp-int16x8.c: New test. * gcc.target/arm/dlstp-int32x4.c: New test. * gcc.target/arm/dlstp-int64x2.c: New test. * gcc.target/arm/dlstp-int8x16.c: New test. ### Inline copy of patch ### diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -7366,7 +7366,8 @@ ;; knows what to generate. 
(define_expand "doloop_end" [(use (match_operand 0 "" "")) ; loop pseudo - (use (match_operand 1 "" ""))] ; label + (use (match_operand 1 "" "")) ; label + (use (match_operand 2 "" ""))] ; decrement constant "optimize > 0 && flag_modulo_sched" { rtx s0; diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *); extern bool arm_q_bit_access (void); extern bool arm_ge_bits_access (void); extern bool arm_target_insn_ok_for_lob (rtx); - +extern rtx arm_attempt_dlstp_transform (rtx, rtx); #ifdef RTX_CODE enum reg_class arm_mode_base_reg_class (machine_mode); diff --git