Hi All,

This patch adds extended costing to model the cost of creating and
manipulating vector constants.  The default values provided are based on
architectural expectations and each cost table can be individually tweaked
per core as needed.
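
For illustration, a hypothetical per-core override of the new fields might
look as follows (a sketch only: the values are invented for the example, and
example_vector_costs is not part of this patch; the field order mirrors the
vector_cost_table change below):

  /* Hypothetical per-core vector costs; values invented for the example.  */
  static const struct vector_cost_table example_vector_costs =
  {
    COSTS_N_INSNS (1),  /* alu.  */
    COSTS_N_INSNS (4),  /* mult.  */
    COSTS_N_INSNS (1),  /* movi.  */
    COSTS_N_INSNS (4),  /* dup.  Assume expensive GP->SIMD transfers.  */
    COSTS_N_INSNS (3)   /* extract.  */
  };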

The changes in this patch cover:

* Construction of PARALLEL or CONST_VECTOR:
  Adds better costing for vectors of constants, based on both the constant
  being created and the instruction that can be used to create it, e.g. a
  movi is cheaper than a literal load.
* Construction of a vector through a vec_dup.
* Extraction of part of a vector using a vec_select.  Here we had to make
  some opportunistic assumptions.  In particular we model extraction of the
  high half of a register as "free" in order to make fusion into the NEON
  high-part instructions possible (see the sketch after this list).  In the
  event that there is no <insn>2 variant of the instruction, the select is
  still cheaper than a load.
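
As a sketch of the high-part case (a minimal example in the spirit of test0
in the new test file, not itself part of the testsuite):

  #include <arm_neon.h>

  /* With the high-half select costed as free, the vget_high_u8 feeding
     vmovl_u8 can combine into a single uxtl2 instead of an extract
     followed by uxtl.  */
  uint16x8_t
  widen_high (uint8x16_t x)
  {
    return vmovl_u8 (vget_high_u8 (x));
  }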

Unfortunately, on AArch64 this only kicks in at -O3 when using intrinsics,
until we fix vld1/2/3 to be gimple instead of RTL intrinsics.

This should also fix the unnecessary stack allocations.
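
As a concrete sketch (a minimal variant of test2 in the new test file): when
the same constant is needed both as a vector and as a scalar, costing the
lane-0 extract as free lets CSE materialise the constant once and move the
scalar out with an fmov instead of building it twice:

  #include <arm_neon.h>

  uint64_t
  shared_const (uint64_t a, uint64x2_t b, uint64x2_t *rt)
  {
    uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
    *rt = vaddq_u64 (val, b);            /* Vector use.  */
    return a | vgetq_lane_u64 (val, 0);  /* Scalar use of lane 0.  */
  }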

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/arm/aarch-common-protos.h (struct vector_cost_table): Add
        movi, dup and extract costing fields.
        * config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
        thunderx_extra_costs, thunderx2t99_extra_costs,
        thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
        them.
        * config/arm/aarch-cost-tables.h (generic_extra_costs,
        cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
        exynosm1_extra_costs, xgene1_extra_costs): Likewise.
        * config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
        * config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose.
        (aarch64_rtx_costs): Add extra costs.
        (aarch64_simd_dup_constant): Support check-only mode.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vect-cse-codegen.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5638d096fa84a27b4ea397f62cd0d05a28e7c8c..6814dae079c9ff40aaa2bb625432bf9eb8906b73 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f80de2ca8971086d6a4bf3aa7793d0cda953b5c8..26d78ffe98a3445dcc490c93849c46a8c2595cf8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -302,6 +302,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -12665,7 +12666,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -13936,8 +13937,65 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case PARALLEL:
+      /* Fall through.  */
+    case CONST_VECTOR:
+	{
+	  rtx gen_insn = aarch64_simd_make_constant (x, true);
+	  /* Not a valid const vector.  */
+	  if (!gen_insn)
+	    break;
 
-      /* Fall through.  */
+	  switch (GET_CODE (gen_insn))
+	    {
+	    case CONST_VECTOR:
+	      /* Load using MOVI/MVNI.  */
+	      if (aarch64_simd_valid_immediate (x, NULL))
+		*cost += extra_cost->vect.movi;
+	      else /* Load using constant pool.  */
+		*cost += extra_cost->ldst.load;
+	      break;
+	    /* Load using a DUP.  */
+	    case VEC_DUPLICATE:
+	      *cost += extra_cost->vect.dup;
+	      break;
+	    default:
+	      *cost += extra_cost->ldst.load;
+	      break;
+	    }
+	  return true;
+	}
+    case VEC_CONCAT:
+	/* Depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+    case VEC_DUPLICATE:
+	*cost += extra_cost->vect.dup;
+	return true;
+    case VEC_SELECT:
+	{
+	  /* Cost a lane-0 or half select as free, otherwise as an extract.  */
+	  rtx op1 = XEXP (x, 1);
+	  int nelts;
+	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
+	      || (BYTES_BIG_ENDIAN
+		  && GET_MODE_NUNITS (mode).is_constant (&nelts)
+		  && INTVAL (op1) == nelts - 1))
+	    ;
+	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	    /* Selecting the high part is not technically free, but we lack
+	       the information to decide that here; e.g. selecting the high
+	       part of a vec_dup *is* free, as is a select that feeds a _high
+	       instruction, but we cannot tell either of those here.  Costing
+	       it as free gives a dup a better chance of being CSEd.  */
+	    ;
+	  else
+	    *cost += extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
@@ -20663,9 +20721,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
 
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
-   assign to the register.  Otherwise return NULL_RTX.  */
+   assign to the register.  Otherwise return NULL_RTX.
+
+   If CHECK is set then the resulting rtx must not be used for
+   codegen; it is only valid for costing purposes.  */
 static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20677,7 +20738,8 @@ aarch64_simd_dup_constant (rtx vals)
   /* We can load this constant by using DUP and a constant in a
      single ARM register.  This will be cheaper than a vector
      load.  */
-  x = copy_to_mode_reg (inner_mode, x);
+  if (!check)
+    x = copy_to_mode_reg (inner_mode, x);
   return gen_vec_duplicate (mode, x);
 }
 
@@ -20685,9 +20747,12 @@ aarch64_simd_dup_constant (rtx vals)
 /* Generate code to load VALS, which is a PARALLEL containing only
    constants (for vec_init) or CONST_VECTOR, efficiently into a
    register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+   If CHECK is set then the resulting rtx must not be used for
+   codegen; it is only valid for costing purposes.  */
 static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   rtx const_dup;
@@ -20719,7 +20784,7 @@ aarch64_simd_make_constant (rtx vals)
       && aarch64_simd_valid_immediate (const_vec, NULL))
     /* Load using MOVI/MVNI.  */
     return const_vec;
-  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
     /* Loaded using DUP.  */
     return const_dup;
   else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..36e468aacfadd7701c6a7cd432bee81472111a16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test0:
+**	movi	v2.16b, 0x3
+**	ldr	q0, \[x0\]
+**	uxtl	v1.8h, v0.8b
+**	uxtl2	v0.8h, v0.16b
+**	ldr	q3, \[x1\]
+**	umlal	v1.8h, v3.8b, v2.8b
+**	umlal2	v0.8h, v3.16b, v2.16b
+**	addhn	v0.8b, v1.8h, v0.8h
+**	str	d0, \[x2\]
+**	ret
+*/
+
+void test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
+{
+  uint8x16_t three_u8 = vdupq_n_u8(3);
+  uint8x16_t x = vld1q_u8(inptr0);
+  uint8x16_t y = vld1q_u8(inptr1);
+  uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
+  uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
+  uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), vget_low_u8(three_u8));
+  uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), vget_high_u8(three_u8));
+  vst1_u8(outptr0, vaddhn_u16(z_l, z_h));
+}
+
+/*
+**test1:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC0
+**	ldr	q1, \[x2, #:lo12:.LC0\]
+**	add	v0.2d, v1.2d, v0.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x2, .LC1
+**	ldr	q1, \[x2, #:lo12:.LC1\]
+**	add	v0.2d, v0.2d, v1.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC2
+**	ldr	q1, \[x2, #:lo12:.LC2\]
+**	add	v0.4s, v1.4s, v0.4s
+**	str	q0, \[x1\]
+**	fmov	w1, s1
+**	orr	w0, w0, w1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v0.16b, v0.16b, 7
+**	mov	x0, 16512
+**	movk	x0, 0x1020, lsl 16
+**	movk	x0, 0x408, lsl 32
+**	movk	x0, 0x102, lsl 48
+**	fmov	d1, x0
+**	pmull	v2.1q, v0.1d, v1.1d
+**	dup	v1.2d, v1.d\[0\]
+**	pmull2	v0.1q, v0.2d, v1.2d
+**	trn2	v2.8b, v2.8b, v0.8b
+**	umov	w0, v2.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+
