Ping: [PATCH][ARM] MVE: Implementing auto-vectorized array * scalar instructions

2023-04-27 Thread Victor L. Do Nascimento via Gcc-patches
May I please ping this one?

https://gcc.gnu.org/pipermail/gcc-patches/2023-February/612152.html

Many Thanks!

Victor

On 2/16/23 15:48, Victor L. Do Nascimento wrote:
> Hi all,
> 
> The back-end pattern for mapping the auto-vectorized representation of
> vector * scalar to the machine instruction VMUL was missing, and
> multiple instructions were needed to reproduce this behavior as a
> result of a failed RTL pattern match in the combine pass.
> 
> RTL patterns were introduced to reproduce the behavior of the
> intrinsics vmulq_n_ and vmulq_n_f.
> 
> In the case of literal constants, an intermediate instruction was
> added to the initial RTL expansion to ensure a general-purpose register
> was allocated to store the constant, which could then be extracted
> from the constant vector.
> 
> For the function
> 
> void test_vmulimm_s32x4 (int32_t * __restrict__ dest, int32_t *a)
> {
>int i;
>for (i=0; i<4; i++) {
>  dest[i] = a[i] * 5;
>}
> }
> 
> 
> The GIMPLE -> RTL expansion is modified to produce:
> (set (reg:SI 119)
>   (const_int 5 [0x5]))
> (set (reg:V4SI 118)
>   (mult:V4SI (vec_duplicate:V4SI (reg:SI 119))
>  (reg:V4SI 117)))
> 
> instead of:
> (set (reg:V4SI 119)
>   (const_vector:V4SI [
>  (const_int 5 [0x5]) repeated x4
>]))
> (set (reg:V4SI 118)
>   (mult:V4SI (reg:V4SI 117)
>  (reg:V4SI 119)))
> 
> The end assembly for the above function introduces the emission of the 
> following insn:
> vmul.i32 q3, q3, r3
> 
> as opposed to:
> vmul.i32 q3, q3, q2
> 
> All tests in gcc.target/arm/simd/mve-vmul-scalar-1.c now pass.
> 
> Added new RTL templates, amended unit test and checked for regressions on 
> arm-none-eabi.
> 
> Thanks,
> Victor
> 
> gcc:
>   * gcc/config/arm/arm.cc (neon_vdup_constant): static keyword
>   removed.
>   * gcc/config/arm/arm-protos.h (neon_vdup_constant): prototype
>   added.
>   * gcc/config/arm/mve.md (@mve_vmulq_n_2): New.
>   * gcc/config/arm/predicates.md (reg_or_mve_replicated_const_operand):
>   New.
>   * gcc/config/arm/vec-common.md (mul3): Modify to use
>   `reg_or_mve_replicated_const_operand'.
> 
> testsuite:
>   * gcc.target/arm/simd/mve-vmul-scalar-1.c: Corrected typo,
>   xfails removed.
> ---
>   gcc/config/arm/arm-protos.h|  1 +
>   gcc/config/arm/arm.cc  |  2 +-
>   gcc/config/arm/mve.md  | 11 +++
>   gcc/config/arm/predicates.md   |  8 
>   gcc/config/arm/vec-common.md   | 14 --
>   .../gcc.target/arm/simd/mve-vmul-scalar-1.c| 13 ++---
>   6 files changed, 39 insertions(+), 10 deletions(-)
> 
> diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> index aea472bfbb9..4cf9fb00e01 100644
> --- a/gcc/config/arm/arm-protos.h
> +++ b/gcc/config/arm/arm-protos.h
> @@ -199,6 +199,7 @@ extern rtx arm_load_tp (rtx);
>   extern bool arm_coproc_builtin_available (enum unspecv);
>   extern bool arm_coproc_ldc_stc_legitimate_address (rtx);
>   extern rtx arm_stack_protect_tls_canary_mem (bool);
> +extern rtx neon_vdup_constant (rtx, bool);
>   
>   
>   #if defined TREE_CODE
> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
> index efc48349dd3..7d9d265b0a7 100644
> --- a/gcc/config/arm/arm.cc
> +++ b/gcc/config/arm/arm.cc
> @@ -13301,7 +13301,7 @@ neon_pairwise_reduce (rtx op0, rtx op1, machine_mode 
> mode,
>  If this is the case, and GENERATE is set, we also generate
>  instructions to do this and return an RTX to assign to the register.  */
>   
> -static rtx
> +rtx
>   neon_vdup_constant (rtx vals, bool generate)
>   {
> machine_mode mode = GET_MODE (vals);
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 555ad1b66c8..806c24e33aa 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -1376,6 +1376,17 @@
> [(set_attr "type" "mve_move")
>   ])
>   
> +(define_insn "@mve_vmulq_n_2"
> +  [
> +   (set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w")
> + (mult:MVE_VLD_ST (vec_duplicate:MVE_VLD_ST (match_operand: 1 
> "s_register_operand" "r"))
> +(match_operand:MVE_VLD_ST 2 
> "s_register_operand" "w")))
> +  ]
> +  "TARGET_HAVE_MVE"
> +  "vmul.%#\t%q0, %q2, %r1"
> +  [(set_attr "type" "mve_move")
> +])
> +
>   ;;
>   ;; [vmulq_u, vmulq_s])
>   ;;
> diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
> index 3139750c606..31eadfa2d3b 100644
> --- a/gcc/config/arm/predicates.md
> +++ b/gcc/config/arm/predicates.md
> @@ -113,6 +113,14 @@
> && neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL));
>   })
>   
> +(define_predicate "reg_or_mve_replicated_const_operand"
> +  (if_then_else (and (match_test "TARGET_HAVE_MVE")
> +  (match_code "const_vector")
> +  (match_test "const_vec_duplicate_p 

[PATCH][ARM] MVE: Implementing auto-vectorized array * scalar instructions

2023-02-16 Thread Victor L. Do Nascimento via Gcc-patches
Hi all,

The back-end pattern for mapping the auto-vectorized representation of
vector * scalar to the machine instruction VMUL was missing, and
multiple instructions were needed to reproduce this behavior as a
result of a failed RTL pattern match in the combine pass.

RTL patterns were introduced to reproduce the behavior of the
intrinsics vmulq_n_ and vmulq_n_f.

In the case of literal constants, an intermediate instruction was
added to the initial RTL expansion to ensure a general-purpose register
was allocated to store the constant, which could then be extracted
from the constant vector.

For the function

void test_vmulimm_s32x4 (int32_t * __restrict__ dest, int32_t *a) 
{ 
  int i;
  for (i=0; i<4; i++) { 
dest[i] = a[i] * 5; 
  }
}


The GIMPLE -> RTL expansion is modified to produce:
(set (reg:SI 119)
 (const_int 5 [0x5]))
(set (reg:V4SI 118)
 (mult:V4SI (vec_duplicate:V4SI (reg:SI 119))
(reg:V4SI 117)))

instead of:
(set (reg:V4SI 119)
 (const_vector:V4SI [
(const_int 5 [0x5]) repeated x4
  ]))
(set (reg:V4SI 118)
 (mult:V4SI (reg:V4SI 117)
(reg:V4SI 119)))

The end assembly for the above function introduces the emission of the 
following insn:
vmul.i32 q3, q3, r3

as opposed to:
vmul.i32 q3, q3, q2

All tests in gcc.target/arm/simd/mve-vmul-scalar-1.c now pass.

Added new RTL templates, amended unit test and checked for regressions on 
arm-none-eabi.

Thanks,
Victor

gcc:
* gcc/config/arm/arm.cc (neon_vdup_constant): static keyword
removed.
* gcc/config/arm/arm-protos.h (neon_vdup_constant): prototype
added.
* gcc/config/arm/mve.md (@mve_vmulq_n_2): New.
* gcc/config/arm/predicates.md (reg_or_mve_replicated_const_operand):
New.
* gcc/config/arm/vec-common.md (mul3): Modify to use
`reg_or_mve_replicated_const_operand'.

testsuite:
* gcc.target/arm/simd/mve-vmul-scalar-1.c: Corrected typo,
xfails removed. 
---
 gcc/config/arm/arm-protos.h|  1 +
 gcc/config/arm/arm.cc  |  2 +-
 gcc/config/arm/mve.md  | 11 +++
 gcc/config/arm/predicates.md   |  8 
 gcc/config/arm/vec-common.md   | 14 --
 .../gcc.target/arm/simd/mve-vmul-scalar-1.c| 13 ++---
 6 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index aea472bfbb9..4cf9fb00e01 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -199,6 +199,7 @@ extern rtx arm_load_tp (rtx);
 extern bool arm_coproc_builtin_available (enum unspecv);
 extern bool arm_coproc_ldc_stc_legitimate_address (rtx);
 extern rtx arm_stack_protect_tls_canary_mem (bool);
+extern rtx neon_vdup_constant (rtx, bool);
 
 
 #if defined TREE_CODE
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index efc48349dd3..7d9d265b0a7 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -13301,7 +13301,7 @@ neon_pairwise_reduce (rtx op0, rtx op1, machine_mode 
mode,
If this is the case, and GENERATE is set, we also generate
instructions to do this and return an RTX to assign to the register.  */
 
-static rtx
+rtx
 neon_vdup_constant (rtx vals, bool generate)
 {
   machine_mode mode = GET_MODE (vals);
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 555ad1b66c8..806c24e33aa 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1376,6 +1376,17 @@
   [(set_attr "type" "mve_move")
 ])
 
+(define_insn "@mve_vmulq_n_2"
+  [
+   (set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w")
+   (mult:MVE_VLD_ST (vec_duplicate:MVE_VLD_ST (match_operand: 1 
"s_register_operand" "r"))
+  (match_operand:MVE_VLD_ST 2 
"s_register_operand" "w")))
+  ]
+  "TARGET_HAVE_MVE"
+  "vmul.%#\t%q0, %q2, %r1"
+  [(set_attr "type" "mve_move")
+])
+
 ;;
 ;; [vmulq_u, vmulq_s])
 ;;
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 3139750c606..31eadfa2d3b 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -113,6 +113,14 @@
   && neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL));
 })
 
+(define_predicate "reg_or_mve_replicated_const_operand"
+  (if_then_else (and (match_test "TARGET_HAVE_MVE")
+(match_code "const_vector")
+(match_test "const_vec_duplicate_p (op)"))
+   (match_operand 0 "immediate_operand")
+   (match_operand 0 "s_register_operand"))
+)
+
 (define_predicate "neon_inv_logic_op2"
   (ior (match_operand 0 "imm_for_neon_inv_logic_operand")
(match_operand 0 "s_register_operand")))
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index f06df4db636..17b67c214b4 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@