arm: Remove unsigned variant of vcaddq_m

2023-08-01 Thread Stamatis Markianos-Wright via Gcc-patches

Hi all,

The unsigned variants of the vcaddq_m operation are not needed within the
compiler, as the assembly output of the signed and unsigned versions of the
ops is identical: with a `.i` suffix (as opposed to separate `.s` and `.u`
suffixes).

Tested with baremetal arm-none-eabi on Arm's fastmodels.

Ok for trunk?

Thanks,
Stamatis Markianos-Wright

gcc/ChangeLog:

    * config/arm/arm-mve-builtins-base.cc (vcaddq_rot90, vcaddq_rot270):
      Use common insn for signed and unsigned front-end definitions.
    * config/arm/arm_mve_builtins.def
      (vcaddq_rot90_m_u, vcaddq_rot270_m_u): Make common.
      (vcaddq_rot90_m_s, vcaddq_rot270_m_s): Remove.
    * config/arm/iterators.md (mve_insn): Merge signed and unsigned defs.
      (isu): Likewise.
      (rot): Likewise.
      (mve_rot): Likewise.
      (supf): Likewise.
      (VxCADDQ_M): Likewise.
    * config/arm/unspecs.md (unspec): Likewise.
---
 gcc/config/arm/arm-mve-builtins-base.cc |  4 ++--
 gcc/config/arm/arm_mve_builtins.def |  6 ++---
 gcc/config/arm/iterators.md | 30 +++--
 gcc/config/arm/mve.md   |  4 ++--
 gcc/config/arm/unspecs.md   |  6 ++---
 5 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc

index e31095ae112..426a87e9852 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -260,8 +260,8 @@ FUNCTION_PRED_P_S_U (vaddvq, VADDVQ)
 FUNCTION_PRED_P_S_U (vaddvaq, VADDVAQ)
 FUNCTION_WITH_RTX_M (vandq, AND, VANDQ)
 FUNCTION_ONLY_N (vbrsrq, VBRSRQ)
-FUNCTION (vcaddq_rot90, unspec_mve_function_exact_insn_rot, 
(UNSPEC_VCADD90, UNSPEC_VCADD90, UNSPEC_VCADD90, VCADDQ_ROT90_M_S, 
VCADDQ_ROT90_M_U, VCADDQ_ROT90_M_F))
-FUNCTION (vcaddq_rot270, unspec_mve_function_exact_insn_rot, 
(UNSPEC_VCADD270, UNSPEC_VCADD270, UNSPEC_VCADD270, VCADDQ_ROT270_M_S, 
VCADDQ_ROT270_M_U, VCADDQ_ROT270_M_F))
+FUNCTION (vcaddq_rot90, unspec_mve_function_exact_insn_rot, 
(UNSPEC_VCADD90, UNSPEC_VCADD90, UNSPEC_VCADD90, VCADDQ_ROT90_M, 
VCADDQ_ROT90_M, VCADDQ_ROT90_M_F))
+FUNCTION (vcaddq_rot270, unspec_mve_function_exact_insn_rot, 
(UNSPEC_VCADD270, UNSPEC_VCADD270, UNSPEC_VCADD270, VCADDQ_ROT270_M, 
VCADDQ_ROT270_M, VCADDQ_ROT270_M_F))
 FUNCTION (vcmlaq, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMLA, -1, -1, VCMLAQ_M_F))
 FUNCTION (vcmlaq_rot90, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMLA90, -1, -1, VCMLAQ_ROT90_M_F))
 FUNCTION (vcmlaq_rot180, unspec_mve_function_exact_insn_rot, (-1, -1, 
UNSPEC_VCMLA180, -1, -1, VCMLAQ_ROT180_M_F))
diff --git a/gcc/config/arm/arm_mve_builtins.def 
b/gcc/config/arm/arm_mve_builtins.def

index 43dacc3dda1..6ac1812c697 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -523,8 +523,8 @@ VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, 
vhsubq_m_n_u, v16qi, v8hi, v4si)

 VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhaddq_m_u, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vhaddq_m_n_u, v16qi, v8hi, 
v4si)

 VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, veorq_m_u, v16qi, v8hi, v4si)
-VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot90_m_u, v16qi, 
v8hi, v4si)
-VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot270_m_u, v16qi, 
v8hi, v4si)
+VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot90_m_, v16qi, 
v8hi, v4si)
+VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vcaddq_rot270_m_, v16qi, 
v8hi, v4si)

 VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vbicq_m_u, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vandq_m_u, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_UNONE_PRED, vaddq_m_u, v16qi, v8hi, v4si)
@@ -587,8 +587,6 @@ VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, 
vhcaddq_rot270_m_s, v16qi, v8hi, v4si)

 VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhaddq_m_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vhaddq_m_n_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, veorq_m_s, v16qi, v8hi, v4si)
-VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot90_m_s, v16qi, v8hi, v4si)
-VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vcaddq_rot270_m_s, v16qi, v8hi, 
v4si)

 VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbrsrq_m_n_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vbicq_m_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_NONE_NONE_NONE_NONE_PRED, vandq_m_s, v16qi, v8hi, v4si)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index b13ff53d36f..2edd0b06370 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -941,8 +941,8 @@
      (VBICQ_N_S "vbic") (VBICQ_N_U "vbic")
      (VBRSRQ_M_N_S "vbrsr") (VBRSRQ_M_N_U "vbrsr") (VBRSRQ_M_N_F 
"vbrsr")

      (VBRSRQ_N_S "vbrsr") (VBRSRQ_N_U "vbrsr") (VBRSRQ_N_F "vbrsr")
-         (VCADDQ_ROT270_M_U "vcadd") (VCADDQ_ROT270_M_S "vcadd") 
(VCADDQ_ROT270_M_F "vcadd")
-         (VCADDQ_ROT90_M_U "vcadd") (VCADDQ_ROT90_M_S "vcadd") 
(VCADDQ_ROT90_M_F 

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-07-05 Thread Stamatis Markianos-Wright via Gcc-patches



On 23/06/2023 11:23, Andre Vieira (lists) wrote:

+  if (insn != arm_mve_get_loop_vctp (body))
+    {

probably a good idea to invert the condition here and return false, 
helps reducing the indenting in this function.


Done, thanks





+    /* Starting from the current insn, scan backwards through the insn
+   chain until BB_HEAD: "for each insn in the BB prior to the 
current".

+    */

There's a trailing whitespace after insn, but also I'd rewrite this 
bit. The "for each insn in the BB prior to the current" is superfluous 
and even confusing to me. How about:
"Scan backwards from the current INSN through the instruction chain 
until the start of the basic block.  "

Yes, agreed, it wasn't very clear. Done.



 I find 'that previous insn' to be confusing as you don't mention any 
previous insn before. So how about something along the lines of:
'If a previous insn defines a register that INSN uses then return true 
if...'

Done



Do we need to check: 'insn != prev_insn' ? Any reason why you can't 
start the loop with:

'for (rtx_insn *prev_insn = PREV_INSN (insn);'


True! Done.



Now I also found a case where things might go wrong in:
+    /* Look at all the DEFs of that previous insn: if one of them 
is on
+   the same REG as our current insn, then recurse in order to 
check

+   that insn's USEs.  If any of these insns return true as
+   MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is 
affected
+   by the change in behaviour from being placed in dlstp/letp 
loop.

+    */
+    df_ref prev_insn_defs = NULL;
+    FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
+  {
+    if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
+    && insn != prev_insn
+    && body == BLOCK_FOR_INSN (prev_insn)
+    && !arm_mve_vec_insn_is_predicated_with_this_predicate
+ (insn, vctp_vpr_generated)
+    && arm_mve_check_df_chain_back_for_implic_predic
+ (prev_insn, vctp_vpr_generated))
+  return true;
+  }

The body == BLOCK_FOR_INSN (prev_insn) hinted me at it, if a def comes 
from outside of the BB (so outside of the loop's body) then it's by 
definition unpredicated by vctp.  I think you want to check that if 
prev_insn defines a register used by insn then return true if 
prev_insn isn't in the same BB or has a chain that is not predicated, 
i.e.: '!arm_mve_vec_insn_is_predicated_with_this_predicate (insn, 
vctp_vpr_generated) && arm_mve_check_df_chain_back_for_implic_predic 
prev_insn, vctp_vpr_generated))' you check body != BLOCK_FOR_INSN 
(prev_insn)'


Yes, you're right, this is vulnerable here. A neater fix to this (I 
think?) is to make the above REGNO_REG_SET_P more generic, so that it 
covers all scalar values and scalar ops, as well.
Then it's a "if this insn in the loop has any input that originates 
outside the bb, then it's unsafe" check and the recursive loop backwards 
is only for the recursive "are any previous insns unsafe"






I also found some other issues, this currently loloops:

uint16_t  test (uint16_t *a, int n)
{
  uint16_t res =0;
  while (n > 0)
    {
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vaddvaq_u16 (res, va);
  res = vaddvaq_p_u16 (res, va, p);
  a += 8;
  n -= 8;
    }
  return res;
}

But it shouldn't, this is because there's a lack of handling of across 
vector instructions. Luckily in MVE all across vector instructions 
have the side-effect that they write to a scalar register, even the 
vshlcq instruction (it writes to a scalar carry output).


Added support for them (you were right, there was some special handling 
needed!)





This did lead me to find an ICE with:

uint16x8_t  test (uint16_t *a, int n)
{
  uint16x8_t res = vdupq_n_u16 (0);
  while (n > 0)
    {
  uint16_t carry = 0;
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vshlcq_u16 (va, &carry, 1);
  res = vshlcq_m_u16 (res, &carry, 1 , p);
  a += 8;
  n -= 8;
    }
  return res;
}

This is because:
+  /* If the USE is outside the loop body bb, or it is inside, 
but

+ is an unpredicated store to memory.  */
+  if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn)
+ || (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate
+ (next_use_insn, vctp_vpr_generated)
+    && mve_memory_operand
+    (SET_DEST (single_set (next_use_insn)),
+ GET_MODE (SET_DEST (single_set (next_use_insn))
+    return true;

Assumes single_set doesn't return 0.


Thanks! That is indeed correct.

Corrected this by having a utility function to scan insn operands and 
check against mve_memory_operand that supports any number of 
operands/SETs in the insn




Let's deal with these issues and I'll

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-07-05 Thread Stamatis Markianos-Wright via Gcc-patches
Thank you Andre for reviewing! I'll attach the updated version of the 
patch to the third review email (your final one thus far ;)


On 22/06/2023 16:54, Andre Vieira (lists) wrote:
Some comments below, all quite minor. I'll continue to review 
tomorrow, I need a fresher brain for 
arm_mve_check_df_chain_back_for_implic_predic  ;)


+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
+  && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == 
VCTP_M))

+    {
+  switch (GET_MODE (XEXP (x, 1)))
+    {
+  case V16BImode:
+    return 16;
+  case V8BImode:
+    return 8;
+  case V4BImode:
+    return 4;
+  case V2QImode:
+    return 2;
+  default:
+    break;
+    }
+    }
+  return 0;
+}

I think you can replace the switch with something along the lines of:
machine_mode mode = GET_MODE (XEXP (x, 1));
return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 0;


Ah true, especially now that there are no HImode predicates!

I added an additional check of `&& VALID_MVE_PRED_MODE (mode)` as well, 
just to make sure we could never pick up V4SImode, etc. (although I'd 
never expect that to happen if `rtx x` came from a valid instruction)





+/* Check if an insn requires the use of the VPR_REG, if it does, 
return the

+   sub-rtx of the VPR_REG.  The `type` argument controls whether
+   this function should:
+   * For type == 0, check all operands, including the OUT operands,
+ and return the first occurance of the VPR_REG.

s/occurance/occurrence/

Done


+  bool requires_vpr;
+  extract_constrain_insn (insn);

indent of requires_vpr is off.

Done


+  if (type == 1 && (recog_data.operand_type[op] == OP_OUT
+    || recog_data.operand_type[op] == OP_INOUT))
+    continue;
+  else if (type == 2 && (recog_data.operand_type[op] == OP_IN
+ || recog_data.operand_type[op] == OP_INOUT))
+    continue;

Why skip INOUT? I guess this will become clear when I see the uses, 
but I'm wondering whether 'only check the input operands.' is clear 
enough. Maybe 'check operands that are input only.' would be more 
accurate?
Oh! Thanks for spotting this. It also doesn't work with my comment at 
the top:

`(INOUT operands are considered both as input and output operands)`

It's been a long time since I wrote this piece, but it might be that I 
added this after realising that there are no insns with an OP_INOUT VPR 
reg. Since I don't think it's functional, I changed the code to align 
with the comment, instead.




+  /* Fetch the reg_class for each entry and check it against the
+   * VPR_REG reg_class.  */

Remove leading * on the second line.

Damn auto-formatters ;)
Done


+
+/* Wrapper function of arm_get_required_vpr_reg with type == 1, so 
return

+   something only if the VPR reg is an input operand to the insn.  */

When talking about a function parameter in comments capitalize (INSN) 
the name. Same for:

Done


+/* Wrapper function of arm_get_required_vpr_reg with type == 2, so 
return
+   something only if the VPR reg is the retrurn value, an output of, 
or is

+   clobbered by the insn.  */

+/* Return true if an insn is an MVE instruction that VPT-predicable, 
but in
+   its unpredicated form, or if it is predicated, but on a predicate 
other

+   than vpr_reg.  */

In this one also 'is a MVE instruction that is VPT-predicable' would 
be better I think.

Oops, thanks for spotting. Done.



On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:
>  Hi all,
>
>  This is the 2/2 patch that contains the functional changes needed
>  for MVE Tail Predicated Low Overhead Loops.  See my previous email
>  for a general introduction of MVE LOLs.
>
>  This support is added through the already existing loop-doloop
>  mechanisms that are used for non-MVE dls/le looping.
>
>  Mid-end changes are:
>
>  1) Relax the loop-doloop mechanism in the mid-end to allow for
> decrement numbers other than -1 and for `count` to be an
> rtx containing a simple REG (which in this case will contain
> the number of elements to be processed), rather
> than an expression for calculating the number of iterations.
>  2) Added a new df utility function: `df_bb_regno_only_def_find` 
that
> will return the DEF of a REG only if it is DEF-ed once 
within the

> basic block.
>
>  And many things in the backend to implement the above 
optimisation:

>
>  3)  Implement the `arm_predict_doloop_p` target hook to 
instruct the

>  mid-end about Low Overhead Loops (MVE or not), as well as
>  `arm_loop_unroll_adjust` which will prevent unrolling of 
any loops
>  that are valid for becoming MVE Tail_Predicated Low 
Overhead Loops
>  (unrolling can transfo

[PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-15 Thread Stamatis Markianos-Wright via Gcc-patches

    Hi all,

    This is the 2/2 patch that contains the functional changes needed
    for MVE Tail Predicated Low Overhead Loops.  See my previous email
    for a general introduction of MVE LOLs.

    This support is added through the already existing loop-doloop
    mechanisms that are used for non-MVE dls/le looping.

    Mid-end changes are:

    1) Relax the loop-doloop mechanism in the mid-end to allow for
   decrement numbers other than -1 and for `count` to be an
   rtx containing a simple REG (which in this case will contain
   the number of elements to be processed), rather
   than an expression for calculating the number of iterations.
    2) Added a new df utility function: `df_bb_regno_only_def_find` that
   will return the DEF of a REG only if it is DEF-ed once within the
   basic block.

    And many things in the backend to implement the above optimisation:

    3)  Implement the `arm_predict_doloop_p` target hook to instruct the
    mid-end about Low Overhead Loops (MVE or not), as well as
    `arm_loop_unroll_adjust` which will prevent unrolling of any loops
    that are valid for becoming MVE Tail_Predicated Low Overhead Loops
    (unrolling can transform a loop in ways that invalidate the dlstp/
    letp transformation logic and the benefit of the dlstp/letp loop
    would be considerably higher than that of unrolling)
    4)  Appropriate changes to the define_expand of doloop_end, new
    patterns for dlstp and letp, new iterators,  unspecs, etc.
    5) `arm_mve_loop_valid_for_dlstp` and a number of checking functions:
   * `arm_mve_dlstp_check_dec_counter`
   * `arm_mve_dlstp_check_inc_counter`
   * `arm_mve_check_reg_origin_is_num_elems`
   * `arm_mve_check_df_chain_back_for_implic_predic`
   * `arm_mve_check_df_chain_fwd_for_implic_predic_impact`
   These all, in some way or another, run checks on the loop
   structure in order to determine if the loop is valid for dlstp/letp
   transformation.
    6) `arm_attempt_dlstp_transform`: (called from the define_expand of
    doloop_end) this function re-checks for the loop's suitability for
    dlstp/letp transformation and then implements it, if possible.
    7) Various utility functions:
   *`arm_mve_get_vctp_lanes` to map
   from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
   to check an insn to see if it requires the VPR or not.
   * `arm_mve_get_loop_vctp`
   * `arm_mve_get_vctp_lanes`
   * `arm_emit_mve_unpredicated_insn_to_seq`
   * `arm_get_required_vpr_reg`
   * `arm_get_required_vpr_reg_param`
   * `arm_get_required_vpr_reg_ret_val`
   * `arm_mve_vec_insn_is_predicated_with_this_predicate`
   * `arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate`

    No regressions on arm-none-eabi with various targets and on
    aarch64-none-elf. Thoughts on getting this into trunk?

    Thank you,
    Stam Markianos-Wright

    gcc/ChangeLog:

    * config/arm/arm-protos.h (arm_target_insn_ok_for_lob): 
Rename to...

    (arm_target_bb_ok_for_lob): ...this
    (arm_attempt_dlstp_transform): New.
    * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): New.
    (TARGET_PREDICT_DOLOOP_P): New.
    (arm_block_set_vect):
    (arm_target_insn_ok_for_lob): Rename from 
arm_target_insn_ok_for_lob.

    (arm_target_bb_ok_for_lob): New.
    (arm_mve_get_vctp_lanes): New.
    (arm_get_required_vpr_reg): New.
    (arm_get_required_vpr_reg_param): New.
    (arm_get_required_vpr_reg_ret_val): New.
    (arm_mve_get_loop_vctp): New.
(arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate): New.
    (arm_mve_vec_insn_is_predicated_with_this_predicate): New.
    (arm_mve_check_df_chain_back_for_implic_predic): New.
    (arm_mve_check_df_chain_fwd_for_implic_predic_impact): New.
    (arm_mve_check_reg_origin_is_num_elems): New.
    (arm_mve_dlstp_check_inc_counter): New.
    (arm_mve_dlstp_check_dec_counter): New.
    (arm_mve_loop_valid_for_dlstp): New.
    (arm_predict_doloop_p): New.
    (arm_loop_unroll_adjust): New.
    (arm_emit_mve_unpredicated_insn_to_seq): New.
    (arm_attempt_dlstp_transform): New.
    * config/arm/iterators.md (DLSTP): New.
    (mode1): Add DLSTP mappings.
    * config/arm/mve.md (*predicated_doloop_end_internal): New.
    (dlstp_insn): New.
    * config/arm/thumb2.md (doloop_end): Update for MVE LOLs.
    * config/arm/unspecs.md: New unspecs.
    * df-core.cc (df_bb_regno_only_def_find): New.
    * df.h (df_bb_regno_only_def_find): New.
    * loop-doloop.cc (doloop_condition_get): Relax conditions.
    (doloop_optimize): Add support for elementwise LoLs.

    gcc/testsuite/ChangeLog:

    * 

Re: [PATCH] [arm] testsuite: make mve_intrinsic_type_overloads-int.c libc-agnostic

2023-05-23 Thread Stamatis Markianos-Wright via Gcc-patches



On 23/05/2023 15:41, Christophe Lyon wrote:

Glibc defines int32_t as 'int' while newlib defines it as 'long int'.

Although these correspond to the same size, g++ complains when using the




   'wrong' version:
   invalid conversion from 'long int*' to 'int32_t*' {aka 'int*'} [-fpermissive]
or
   invalid conversion from 'int*' to 'int32_t*' {aka 'long int*'} [-fpermissive]

when calling vst1q(int32*, int32x4_t) with a first parameter of type
'long int *' (resp. 'int *')

To make this test pass with any type of toolchain, this patch defines
'word_type' according to which libc is in use.


Thank you for spotting this! I think this fix is needed on all of 
GCC12,13,trunk btw (it should apply cleanly)





2023-05-23  Christophe Lyon  

gcc/testsuite/
* gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c:
Support both definitions of int32_t.
---
  .../mve_intrinsic_type_overloads-int.c| 28 ++-
  1 file changed, 15 insertions(+), 13 deletions(-)

diff --git 
a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
index 7947dc024bc..ab51cc8b323 100644
--- 
a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
+++ 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
@@ -47,14 +47,22 @@ foo2 (short * addr, int16x8_t value)
vst1q (addr, value);
  }
  
-void

-foo3 (int * addr, int32x4_t value)
-{
-  vst1q (addr, value); /* { dg-warning "invalid conversion" "" { target c++ } 
} */
-}
+/* Glibc defines int32_t as 'int' while newlib defines it as 'long int'.
+
+   Although these correspond to the same size, g++ complains when using the
+   'wrong' version:
+  invalid conversion from 'long int*' to 'int32_t*' {aka 'int*'} [-fpermissive]
+
+  The trick below is to make this test pass whether using glibc-based or
+  newlib-based toolchains.  */
  
+#if defined(__GLIBC__)

+#define word_type int
+#else
+#define word_type long int
+#endif
  void
-foo4 (long * addr, int32x4_t value)
+foo3 (word_type * addr, int32x4_t value)
  {
vst1q (addr, value);
  }
@@ -78,13 +86,7 @@ foo7 (unsigned short * addr, uint16x8_t value)
  }
  
  void

-foo8 (unsigned int * addr, uint32x4_t value)
-{
-  vst1q (addr, value); /* { dg-warning "invalid conversion" "" { target c++ } 
} */
-}
-
-void
-foo9 (unsigned long * addr, uint32x4_t value)
+foo8 (unsigned word_type * addr, uint32x4_t value)
  {
vst1q (addr, value);
  }


[GCC12 backport] arm: MVE testsuite and backend bugfixes

2023-05-17 Thread Stamatis Markianos-Wright via Gcc-patches



On 17/05/2023 10:26, Kyrylo Tkachov wrote:

Hi Stam,


-Original Message-
From: Stam Markianos-Wright 
Sent: Tuesday, May 16, 2023 2:32 PM
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Earnshaw
; Andrea Corallo 
Subject: [GCC12 backport] arm: MVE testsuite and backend bugfixes

Hi all,

We've recently sent up a lot of patches overhauling the testsuite of the
Arm MVE backend.
With these changes, we've also identified and fixed a number of bugs
(some backend bugs and many to do with the polymorphism of intrinsics in
the MVE header file).
These would all be relevant to backport to GCC12.
The list is as follows (in the order they all apply on top of each other):

* This patch series:
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606552.html
(commits 9a79b522e0663a202a288db56ebcbdcdb48bdaca to
f2b54e5b796b00f0072b61f9cd6a964c66ead29b)
* ecc363971aeac52481d92de8b37521f6cc2d38e6 arm: Fix MVE testsuite
fallouts
* 06aa66af7d0dacc1b247d9e38175e789ef159191 arm: Add missing early
clobber to MVE vrev64q_m patterns
* c09663eabfb84ac56ddd8d44abcab3f4902c83bd testsuite: [arm] Relax
expected register names in MVE tests
* 330d665ce6dcc63ed0bd78d807e69bbfc55255b6 arm: [MVE] Add missing
length=8 attribute
* 8d4f007398bc3f8fea812fb8cff4d7d0556d12f1 arm: fix mve intrinsics scan
body tests for C++
* This patch series
https://gcc.gnu.org/pipermail/gcc-patches/2023-January/610312.html
(commits dd4424ef898608321b60610c4f3c98737ace3680 to
267f01a493ab8a0bec9325ce3386b946c46f2e98)
* 8a1360e72d6c6056606aa5edd8c906c50f26de59 arm: Split up MVE _Generic
associations to prevent type clashes [PR107515]
* 3f0ca7a3e4431534bff3b8eb73709cc822e489b0 arm: Fix vcreate definition
* c1093923733a1072a237f112e3239b5ebd88eadd arm: Make MVE masked
stores
read memory operand [PR 108177]
* f54e31ddefe3ea7146624eabcb75b1c90dc59f1a arm: fix __arm_vld1q_z*
and
__arm_vst1q_p* intrinsics [PR108442]
* 1d509f190393627cdf0afffc427b25dd21c2 arm: remove unused variables
from test


Ok to backport.


-- up to this point everything applied cleanly. The final two need minor
rebasing changes --

* This patch series:
https://gcc.gnu.org/pipermail/gcc-patches/2023-April/617008.html (Not
pushed to trunk yet, but has been approved. For trunk we do now need to
resolve some merge conflicts, since Christophe has started merging the
MVE Intrinsic Restructuring, but these are trivial. I will also backport
to GCC13 where this patch series applies cleanly)
* cfa118fc089e38a94ec60ccf5b667aea015e5f60 [arm] complete vmsr/vmrs
blank and case adjustments.

The final one is a commit from Alexandre Oliva that is needed to ensure
that we don't accidentally regress the test due to the tabs vs spaces
and capitalisation on the vmrs/vmsr instructions :)

After all that, no regressions on baremetal arm-none-eabi in a bunch of
configurations (-marm, thumb1, thumb2, MVE, MVE.FP, softfp and hardfp):


Will you be sending these to the list after adjusting?


Yep, I believe we have to!

I'm thinking we should do one batch of [committed] emails for GCC12 and 
one for trunk.


For GCC13 the previously sent version of the series at 
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/617373.html applies 
cleanly. Let me know if there's anything further we need to do!


Thanks,
Stamatis



Thanks,
Kyrill


Thanks,
Stam


[GCC12 backport] arm: MVE testsuite and backend bugfixes

2023-05-16 Thread Stamatis Markianos-Wright via Gcc-patches

Hi all,

We've recently sent up a lot of patches overhauling the testsuite of the 
Arm MVE backend.
With these changes, we've also identified and fixed a number of bugs 
(some backend bugs and many to do with the polymorphism of intrinsics in 
the MVE header file).

These would all be relevant to backport to GCC12.
The list is as follows (in the order they all apply on top of each other):

* This patch series: 
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606552.html 
(commits 9a79b522e0663a202a288db56ebcbdcdb48bdaca to 
f2b54e5b796b00f0072b61f9cd6a964c66ead29b)

* ecc363971aeac52481d92de8b37521f6cc2d38e6 arm: Fix MVE testsuite fallouts
* 06aa66af7d0dacc1b247d9e38175e789ef159191 arm: Add missing early 
clobber to MVE vrev64q_m patterns
* c09663eabfb84ac56ddd8d44abcab3f4902c83bd testsuite: [arm] Relax 
expected register names in MVE tests
* 330d665ce6dcc63ed0bd78d807e69bbfc55255b6 arm: [MVE] Add missing 
length=8 attribute
* 8d4f007398bc3f8fea812fb8cff4d7d0556d12f1 arm: fix mve intrinsics scan 
body tests for C++
* This patch series 
https://gcc.gnu.org/pipermail/gcc-patches/2023-January/610312.html 
(commits dd4424ef898608321b60610c4f3c98737ace3680 to 
267f01a493ab8a0bec9325ce3386b946c46f2e98)
* 8a1360e72d6c6056606aa5edd8c906c50f26de59 arm: Split up MVE _Generic 
associations to prevent type clashes [PR107515]

* 3f0ca7a3e4431534bff3b8eb73709cc822e489b0 arm: Fix vcreate definition
* c1093923733a1072a237f112e3239b5ebd88eadd arm: Make MVE masked stores 
read memory operand [PR 108177]
* f54e31ddefe3ea7146624eabcb75b1c90dc59f1a arm: fix __arm_vld1q_z* and 
__arm_vst1q_p* intrinsics [PR108442]
* 1d509f190393627cdf0afffc427b25dd21c2 arm: remove unused variables 
from test


-- up to this point everything applied cleanly. The final two need minor 
rebasing changes --


* This patch series: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-April/617008.html (Not 
pushed to trunk yet, but has been approved. For trunk we do now need to 
resolve some merge conflicts, since Christophe has started merging the 
MVE Intrinsic Restructuring, but these are trivial. I will also backport 
to GCC13 where this patch series applies cleanly)
* cfa118fc089e38a94ec60ccf5b667aea015e5f60 [arm] complete vmsr/vmrs 
blank and case adjustments.


The final one is a commit from Alexandre Oliva that is needed to ensure 
that we don't accidentally regress the test due to the tabs vs spaces 
and capitalisation on the vmrs/vmsr instructions :)


After all that, no regressions on baremetal arm-none-eabi in a bunch of 
configurations (-marm, thumb1, thumb2, MVE, MVE.FP, softfp and hardfp):


Thanks,
Stam



[PATCH 10/10] arm testsuite: Shifts and get_FPSCR ACLE optimisation fixes

2023-05-03 Thread Stamatis Markianos-Wright via Gcc-patches

Hi Kyrill,

On 28/04/2023 17:58, Kyrylo Tkachov wrote:



-Original Message-
From: Andrea Corallo 
Sent: Friday, April 28, 2023 12:30 PM
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Earnshaw
; Stam Markianos-Wright 
Subject: [PATCH 10/10] arm testsuite: Shifts and get_FPSCR ACLE optimisation
fixes

From: Stam Markianos-Wright 

These newly updated tests were rewritten by Andrea. Some of them
needed further manual fixing as follows:

* The #shift immediate value not in the check-function-bodies as expected
* Some shifts getting optimised to mov immediates, e.g.
   `uqshll (1, 1);` -> movs r0, #2; movs r1, #0

Shouldn't this test be testing something that cannot be constant-folded away? 
i.e. have non-constant arguments?
I think we should have conformance tests first and foremost, and follow-up 
tests for such optimisations should be (welcome) added separately.


Ahh, good point! I think in that case I've removed these checks
from here and put them into a new test (it's a bit trivial but I
couldn't find anywhere else where we are doing this check with MVE
instructions)


Also, since this patch is the last one in this series, would the
series be Ok for backporting to GCC13?

Thank you!
Stam




* The ACLE was specifying sub-optimal code: lsr+and instead of ubfx. In
   this case the test rewritten from the ACLE had the lsr+and pattern,
   but the compiler was able to optimise to ubfx. Hence I've changed the
   test to now match on ubfx.

That looks ok.
Thanks,
Kyrill


gcc/testsuite/ChangeLog:

   * gcc.target/arm/mve/intrinsics/srshr.c: Update shift value.
   * gcc.target/arm/mve/intrinsics/srshrl.c: Update shift value.
   * gcc.target/arm/mve/intrinsics/uqshl.c: Update shift value and mov
imm.
   * gcc.target/arm/mve/intrinsics/uqshll.c: Update shift value and mov
imm.
   * gcc.target/arm/mve/intrinsics/urshr.c: Update shift value.
   * gcc.target/arm/mve/intrinsics/urshrl.c: Update shift value.
   * gcc.target/arm/mve/intrinsics/vadciq_m_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vadciq_m_u32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vadciq_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vadciq_u32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vadcq_m_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vadcq_m_u32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vadcq_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vadcq_u32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbciq_m_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbciq_m_u32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbciq_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbciq_u32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbcq_m_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbcq_m_u32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbcq_s32.c: Update to ubfx.
   * gcc.target/arm/mve/intrinsics/vsbcq_u32.c: Update to ubfx.
---
  gcc/testsuite/gcc.target/arm/mve/intrinsics/srshr.c   | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/srshrl.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/uqshl.c   | 4 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/uqshll.c  | 5 +++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/urshr.c   | 4 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/urshrl.c  | 4 ++--
  .../gcc.target/arm/mve/intrinsics/vadciq_m_s32.c  | 8 ++--
  .../gcc.target/arm/mve/intrinsics/vadciq_m_u32.c  | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vadciq_s32.c  | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vadciq_u32.c  | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vadcq_m_s32.c | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vadcq_m_u32.c | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vadcq_s32.c   | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vadcq_u32.c   | 8 ++--
  .../gcc.target/arm/mve/intrinsics/vsbciq_m_s32.c  | 8 ++--
  .../gcc.target/arm/mve/intrinsics/vsbciq_m_u32.c  | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vsbciq_s32.c  | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vsbciq_u32.c  | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vsbcq_m_s32.c | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vsbcq_m_u32.c | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vsbcq_s32.c   | 8 ++--
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vsbcq_u32.c   | 8 ++--
  22 files changed, 43 insertions(+), 106 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/srshr.c
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/srshr.c
index 94e3f42fd33..734375d58c0 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/srshr.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/srshr.c

Re: [PATCH 04/10] arm: Stop vadcq, vsbcq intrinsics from overwriting the FPSCR NZ flags

2023-05-03 Thread Stamatis Markianos-Wright via Gcc-patches


On 28/04/2023 17:45, Kyrylo Tkachov wrote:

Hi Andrea, Stam,


-Original Message-
From: Andrea Corallo 
Sent: Friday, April 28, 2023 12:30 PM
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Earnshaw
; Stam Markianos-Wright 
Subject: [PATCH 04/10] arm: Stop vadcq, vsbcq intrinsics from overwriting the
FPSCR NZ flags

From: Stam Markianos-Wright 

Hi all,

We noticed that calls to the vadcq and vsbcq intrinsics, both of
which use __builtin_arm_set_fpscr_nzcvqc to set the Carry flag in
the FPSCR, would produce the following code:

```
< r2 is the *carry input >
vmrs  r3, FPSCR_nzcvqc
bic   r3, r3, #536870912
orr   r3, r3, r2, lsl #29
vmsr  FPSCR_nzcvqc, r3
```

when the MVE ACLE instead gives a different instruction sequence of:
```
< Rt is the *carry input >
VMRS Rs,FPSCR_nzcvqc
BFI Rs,Rt,#29,#1
VMSR FPSCR_nzcvqc,Rs
```

the bic + orr pair is slower and it's also wrong, because, if the
*carry input is greater than 1, then we risk overwriting the top two
bits of the FPSCR register (the N and Z flags).

This turned out to be a problem in the header file and the solution was
to simply add a `& 0x1u` to the `*carry` input: then the compiler knows
that we only care about the lowest bit and can optimise to a BFI.

Ok for trunk?

Ok, but I think this needs testsuite coverage for the bug?
Thanks,
Kyrill


So this can be seen in the new vadcq* , vsbcq* tests:

**    ...
**    vmrs    (?:ip|fp|r[0-9]+), FPSCR_nzcvqc(?:    @.*|)
**    ...
**    bfi    (?:ip|fp|r[0-9]+), (?:ip|fp|r[0-9]+), #29, #1(?: @.*|)
**    ...
**    vmsr    FPSCR_nzcvqc, (?:ip|fp|r[0-9]+)(?:    @.*|)
**    ...

The fact that there's a BFI there rather than the BIC + ORR shows
that this has now been optimised by the compiler and the bug isn't
present in those intrinsics any longer... Sorry, I should have linked
that in better in our patch series!

Added a runtest, also, as it was fairly trivial to write it out :)

Thanks,
Stam




Thanks,
Stam Markianos-Wright

gcc/ChangeLog:

   * config/arm/arm_mve.h (__arm_vadcq_s32): Fix arithmetic.
   (__arm_vadcq_u32): Likewise.
   (__arm_vadcq_m_s32): Likewise.
   (__arm_vadcq_m_u32): Likewise.
   (__arm_vsbcq_s32): Likewise.
   (__arm_vsbcq_u32): Likewise.
   (__arm_vsbcq_m_s32): Likewise.
   (__arm_vsbcq_m_u32): Likewise.
---
  gcc/config/arm/arm_mve.h | 16 
  1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 1262d668121..8778216304b 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -16055,7 +16055,7 @@ __extension__ extern __inline int32x4_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  __arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry)
  {
-  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | (*__carry << 29));
+  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | ((*__carry & 0x1u) << 29));
int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
@@ -16065,7 +16065,7 @@ __extension__ extern __inline uint32x4_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  __arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry)
  {
-  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | (*__carry << 29));
+  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | ((*__carry & 0x1u) << 29));
uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
@@ -16075,7 +16075,7 @@ __extension__ extern __inline int32x4_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  __arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b,
unsigned * __carry, mve_pred16_t __p)
  {
-  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | (*__carry << 29));
+  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | ((*__carry & 0x1u) << 29));
int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
@@ -16085,7 +16085,7 @@ __extension__ extern __inline uint32x4_t
  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
  __arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b,
unsigned * __carry, mve_pred16_t __p)
  {
-  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | (*__carry << 29));
+  __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () &
~0x2000u) | ((*__carry & 0x1u) << 29));
uint32x4_t __res =  __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b,
__p);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;

Re: [PATCH 09/10] arm testsuite: XFAIL or relax registers in some tests

2023-05-02 Thread Stamatis Markianos-Wright via Gcc-patches



On 28/04/2023 17:54, Kyrylo Tkachov wrote:



-Original Message-
From: Andrea Corallo 
Sent: Friday, April 28, 2023 12:30 PM
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Earnshaw
; Stam Markianos-Wright 
Subject: [PATCH 09/10] arm testsuite: XFAIL or relax registers in some tests

From: Stam Markianos-Wright 

Hi all,

This is a simple testsuite tidy-up patch, addressing two types of errors:

* The vcmp vector-scalar tests failing due to the compiler's preference
of vector-vector comparisons, over vector-scalar comparisons. This is
due to the lack of cost model for MVE and the compiler not knowing that
the RTL vec_duplicate is free in those instructions. For now, we simply
XFAIL these checks.

I'd like to see this deficiency tracked in Bugzilla before we mark these as 
XFAIL.


Yep! Raised https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109697
(And I'll also update this commit message to reference that PR now)




* The tests for pr108177 had strict usage of q0 and r0 registers,
meaning that they would FAIL with -mfloat-abi=softf. The register checks
have now been relaxed.

This part is ok.
Thanks,
Kyrill


gcc/testsuite/ChangeLog:

   * gcc.target/arm/mve/intrinsics/srshr.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/srshrl.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/uqshl.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/uqshll.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/urshr.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/urshrl.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadciq_m_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadciq_m_u32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadciq_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadciq_u32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadcq_m_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadcq_m_u32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadcq_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vadcq_u32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbciq_m_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbciq_m_u32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbciq_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbciq_u32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbcq_m_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbcq_m_u32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbcq_s32.c: XFAIL check.
   * gcc.target/arm/mve/intrinsics/vsbcq_u32.c: XFAIL check.
   * gcc.target/arm/mve/pr108177-1.c: Relax registers.
   * gcc.target/arm/mve/pr108177-10.c: Relax registers.
   * gcc.target/arm/mve/pr108177-11.c: Relax registers.
   * gcc.target/arm/mve/pr108177-12.c: Relax registers.
   * gcc.target/arm/mve/pr108177-13.c: Relax registers.
   * gcc.target/arm/mve/pr108177-14.c: Relax registers.
   * gcc.target/arm/mve/pr108177-2.c: Relax registers.
   * gcc.target/arm/mve/pr108177-3.c: Relax registers.
   * gcc.target/arm/mve/pr108177-4.c: Relax registers.
   * gcc.target/arm/mve/pr108177-5.c: Relax registers.
   * gcc.target/arm/mve/pr108177-6.c: Relax registers.
   * gcc.target/arm/mve/pr108177-7.c: Relax registers.
   * gcc.target/arm/mve/pr108177-8.c: Relax registers.
   * gcc.target/arm/mve/pr108177-9.c: Relax registers.
---
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpcsq_n_u16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpcsq_n_u32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpcsq_n_u8.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_u16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_u32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_u8.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgeq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgeq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgtq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgtq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmphiq_n_u16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmphiq_n_u32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmphiq_n_u8.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpleq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpleq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpltq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpltq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_u16.c | 2 +-
  

[PATCH 09/10] arm testsuite: XFAIL or relax registers in some tests

2023-05-02 Thread Stamatis Markianos-Wright via Gcc-patches



On 02/05/2023 09:28, Christophe Lyon wrote:

Hi Stam!


On 4/28/23 13:30, Andrea Corallo via Gcc-patches wrote:

From: Stam Markianos-Wright 

Hi all,

This is a simple testsuite tidy-up patch, addressing two types of errors:

* The vcmp vector-scalar tests failing due to the compiler's preference
of vector-vector comparisons, over vector-scalar comparisons. This is
due to the lack of cost model for MVE and the compiler not knowing that
the RTL vec_duplicate is free in those instructions. For now, we simply
XFAIL these checks.
* The tests for pr108177 had strict usage of q0 and r0 registers,
meaning that they would FAIL with -mfloat-abi=softf. The register checks

Very minor typo: should be "softfp" :-)

Ahh indeed, thanks! Will change this before pushing


Thanks,

Christophe


have now been relaxed.

gcc/testsuite/ChangeLog:

* gcc.target/arm/mve/intrinsics/srshr.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/srshrl.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/uqshl.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/uqshll.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/urshr.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/urshrl.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadciq_m_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadciq_m_u32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadciq_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadciq_u32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadcq_m_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadcq_m_u32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadcq_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vadcq_u32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbciq_m_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbciq_m_u32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbciq_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbciq_u32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbcq_m_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbcq_m_u32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbcq_s32.c: XFAIL check.
* gcc.target/arm/mve/intrinsics/vsbcq_u32.c: XFAIL check.
* gcc.target/arm/mve/pr108177-1.c: Relax registers.
* gcc.target/arm/mve/pr108177-10.c: Relax registers.
* gcc.target/arm/mve/pr108177-11.c: Relax registers.
* gcc.target/arm/mve/pr108177-12.c: Relax registers.
* gcc.target/arm/mve/pr108177-13.c: Relax registers.
* gcc.target/arm/mve/pr108177-14.c: Relax registers.
* gcc.target/arm/mve/pr108177-2.c: Relax registers.
* gcc.target/arm/mve/pr108177-3.c: Relax registers.
* gcc.target/arm/mve/pr108177-4.c: Relax registers.
* gcc.target/arm/mve/pr108177-5.c: Relax registers.
* gcc.target/arm/mve/pr108177-6.c: Relax registers.
* gcc.target/arm/mve/pr108177-7.c: Relax registers.
* gcc.target/arm/mve/pr108177-8.c: Relax registers.
* gcc.target/arm/mve/pr108177-9.c: Relax registers.
---
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpcsq_n_u16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpcsq_n_u32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpcsq_n_u8.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_u16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_u32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpeqq_n_u8.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgeq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgeq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgtq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpgtq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmphiq_n_u16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmphiq_n_u32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmphiq_n_u8.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpleq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpleq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpltq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpltq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_f16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_f32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_u16.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_u32.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/vcmpneq_n_u8.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/pr108177-1.c   | 4 ++--
  gcc/testsuite/gcc.target/arm/mve/pr108177-10.c  | 4 ++--
  gcc/testsuite/gcc.target/arm/mve/pr108177-11.c  | 4 ++--
  gcc/testsuite/gcc.target/arm/mve/pr108177-12.c  | 4 ++--
  gcc/testsuite/gcc.target/arm/mve/pr108177-13.c 

[committed][testsuite] arm: remove unused variables from test

2023-04-06 Thread Stamatis Markianos-Wright via Gcc-patches

Hi all,

This is just a minor issue I found with a previous test
of mine that caused it to fail in C++ mode due to these
unused const variables being uninitialised. I forgot to
remove these after removing some test cases that did use
them.
I removed the test cases, because I came to the
conclusion that the const-ness of the immediate was
irrelevant to the test itself.
Removing the variables now makes the test PASS.
Committed as Obvious.

gcc/testsuite/ChangeLog:

    * 
gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c: Remove 
unused variables.
    * 
gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c: Remove 
unused variables.




 Inline diff of patch 

diff --git 
a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c

index 7492e9b22bd..a2787a47859 100644
--- 
a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c
+++ 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c

@@ -19,15 +19,6 @@ int16_t i6;
 int32_t i7;
 int64_t i8;

-const int ci1;
-const short ci2;
-const long ci3;
-const long long ci4;
-const int8_t ci5;
-const int16_t ci6;
-const int32_t ci7;
-const int64_t ci8;
-
 float16x8_t floatvec;
 int16x8_t intvec;

diff --git 
a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c

index 9a921bf40e8..7b88f462e17 100644
--- 
a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
+++ 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c

@@ -13,15 +13,6 @@ int16_t i6;
 int32_t i7;
 int64_t i8;

-const int ci1;
-const short ci2;
-const long ci3;
-const long long ci4;
-const int8_t ci5;
-const int16_t ci6;
-const int32_t ci7;
-const int64_t ci8;
-
 int16x8_t intvec;

 void test(void)
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c
index 7492e9b22bd..a2787a47859 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-fp.c
@@ -19,15 +19,6 @@ int16_t i6;
 int32_t i7;
 int64_t i8;
 
-const int ci1;
-const short ci2;
-const long ci3;
-const long long ci4;
-const int8_t ci5;
-const int16_t ci6;
-const int32_t ci7;
-const int64_t ci8;
-
 float16x8_t floatvec;
 int16x8_t intvec;
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
index 9a921bf40e8..7b88f462e17 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_intrinsic_type_overloads-int.c
@@ -13,15 +13,6 @@ int16_t i6;
 int32_t i7;
 int64_t i8;
 
-const int ci1;
-const short ci2;
-const long ci3;
-const long long ci4;
-const int8_t ci5;
-const int16_t ci6;
-const int32_t ci7;
-const int64_t ci8;
-
 int16x8_t intvec;
 
 void test(void)


Re: arm: Fix MVE vcreate definition

2023-04-04 Thread Stamatis Markianos-Wright via Gcc-patches



On 29/03/2023 13:16, Kyrylo Tkachov wrote:

-Original Message-
From: Stam Markianos-Wright
Sent: Wednesday, March 29, 2023 11:50 AM
To:gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov
Subject: arm: Fix MVE vcreate definition

Hi all,

I just found a bug that goes back to the initial merge of
the MVE backend: The vcreate intrinsic has had its vector
lanes mixed up, compared to what was intended (as per
the ACLE) definition. This is also a discrepancy with clang:
https://godbolt.org/z/4n93e5aqj

This patch simply switches the operands around and
makes the tests more specific on the input registers
(I do not touch the output Q regs as they vary based
on softfp/hardfp or the input registers when the input
is a constant, since, in that case, a single register
is loaded with a constant and then the same register is
used twice as "vmov q0[2], q0[0], r2, r2" and the reg
num might also not always be guaranteed).

No regressions on MVE testsuite configurations or in
the CMSIS-NN testsuite.

Ok for trunk? (Despite this being late in Stage 4, sorry
about that!)

Ok, since this is a wrong-code fix.

Thanks, applied as:
3f0ca7a3e4431534bff3b8eb73709cc822e489b0.

This needs backports as well, right?

Indeed! I'm building up a larger list of commits that we're hoping
to backport, so I will include this on that list.


Thanks,
Kyrill


Thanks,
Stamatis Markianos-Wright

gcc/ChangeLog:

      * config/arm/mve.md (mve_vcvtq_n_to_f_): Swap
operands.
    (mve_vcreateq_f): Swap operands.

gcc/testsuite/ChangeLog:

      * gcc.target/arm/mve/intrinsics/vcreateq_f16.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_f32.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_s16.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_s32.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_s64.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_s8.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_u16.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_u32.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_u64.c: Tighten test.
      * gcc.target/arm/mve/intrinsics/vcreateq_u8.c: Tighten test.


arm: Fix MVE vcreate definition

2023-03-29 Thread Stamatis Markianos-Wright via Gcc-patches

Hi all,

I just found a bug that goes back to the initial merge of
the MVE backend: The vcreate intrinsic has had its vector
lanes mixed up, compared to what was intended (as per
the ACLE) definition. This is also a discrepancy with clang:
https://godbolt.org/z/4n93e5aqj

This patch simply switches the operands around and
makes the tests more specific on the input registers
(I do not touch the output Q regs as they vary based
on softfp/hardfp or the input registers when the input
is a constant, since, in that case, a single register
is loaded with a constant and then the same register is
used twice as "vmov q0[2], q0[0], r2, r2" and the reg
num might also not always be guaranteed).

No regressions on MVE testsuite configurations or in
the CMSIS-NN testsuite.

Ok for trunk? (Despite this being late in Stage 4, sorry
about that!)

Thanks,
Stamatis Markianos-Wright

gcc/ChangeLog:

    * config/arm/mve.md (mve_vcvtq_n_to_f_): Swap operands.
  (mve_vcreateq_f): Swap operands.

gcc/testsuite/ChangeLog:

    * gcc.target/arm/mve/intrinsics/vcreateq_f16.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_f32.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_s16.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_s32.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_s64.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_s8.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_u16.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_u32.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_u64.c: Tighten test.
    * gcc.target/arm/mve/intrinsics/vcreateq_u8.c: Tighten test.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index d913ca24f8ea8d2fcadea972e037ede6f9cf36f9..a3589b19edd7398f66f8dc51276cf94345ec66a5 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -763,7 +763,7 @@
 	 VCREATEQ_F))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vmov %q0[2], %q0[0], %Q2, %Q1\;vmov %q0[3], %q0[1], %R2, %R1"
+  "vmov %q0[2], %q0[0], %Q1, %Q2\;vmov %q0[3], %q0[1], %R1, %R2"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
@@ -778,7 +778,7 @@
 	 VCREATEQ))
   ]
   "TARGET_HAVE_MVE"
-  "vmov %q0[2], %q0[0], %Q2, %Q1\;vmov %q0[3], %q0[1], %R2, %R1"
+  "vmov %q0[2], %q0[0], %Q1, %Q2\;vmov %q0[3], %q0[1], %R1, %R2"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f16.c
index 0458bb1bb7cd6a3f898f3138f86d9c52374ae48d..8d6764d893834bb751ba79476f67ef5111ee1775 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f16.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f16.c
@@ -12,8 +12,8 @@ extern "C" {
 /*
 **foo:
 **	...
-**	vmov q[0-9+]\[2\], q[0-9+]\[0\], r[0-9+], r[0-9+]
-**	vmov q[0-9+]\[3\], q[0-9+]\[1\], r[0-9+], r[0-9+]
+**	vmov q[0-9+]\[2\], q[0-9+]\[0\], r0, r2
+**	vmov q[0-9+]\[3\], q[0-9+]\[1\], r1, r3
 **	...
 */
 float16x8_t
@@ -39,4 +39,4 @@ foo1 ()
 }
 #endif
 
-/* { dg-final { scan-assembler-not "__ARM_undef" } } */
\ No newline at end of file
+/* { dg-final { scan-assembler-not "__ARM_undef" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f32.c
index af782b5ac5379f6890af03c3f5ae6ef41492f623..6ab05ced809ec38eb5b72123120a0c822cf3e351 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f32.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_f32.c
@@ -12,8 +12,8 @@ extern "C" {
 /*
 **foo:
 **	...
-**	vmov q[0-9+]\[2\], q[0-9+]\[0\], r[0-9+], r[0-9+]
-**	vmov q[0-9+]\[3\], q[0-9+]\[1\], r[0-9+], r[0-9+]
+**	vmov q[0-9+]\[2\], q[0-9+]\[0\], r0, r2
+**	vmov q[0-9+]\[3\], q[0-9+]\[1\], r1, r3
 **	...
 */
 float32x4_t
@@ -39,4 +39,4 @@ foo1 ()
 }
 #endif
 
-/* { dg-final { scan-assembler-not "__ARM_undef" } } */
\ No newline at end of file
+/* { dg-final { scan-assembler-not "__ARM_undef" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_s16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_s16.c
index 8a3e91843f8cdece415d685b13710e4d250d8da0..290637595a4a26c019abcb6e85f1741d72ade93f 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_s16.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_s16.c
@@ -12,8 +12,8 @@ extern "C" {
 /*
 **foo:
 **	...
-**	vmov q[0-9+]\[2\], q[0-9+]\[0\], r[0-9+], r[0-9+]
-**	vmov q[0-9+]\[3\], q[0-9+]\[1\], r[0-9+], r[0-9+]
+**	vmov q[0-9+]\[2\], q[0-9+]\[0\], r0, r2
+**	vmov q[0-9+]\[3\], q[0-9+]\[1\], r1, r3
 **	...
 */
 int16x8_t
@@ -39,4 +39,4 @@ foo1 ()
 }
 #endif
 
-/* { dg-final { scan-assembler-not "__ARM_undef" } } */
\ No newline at end of file
+/* { dg-final { scan-assembler-not "__ARM_undef" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vcreateq_s32.c