> -----Original Message-----
> From: Richard Sandiford <richard.sandif...@arm.com>
> Sent: Tuesday, August 31, 2021 7:38 PM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; Marcus Shawcroft
> <marcus.shawcr...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
> and operations
> 
> Tamar Christina <tamar.christ...@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandif...@arm.com>
> >> Sent: Tuesday, August 31, 2021 5:07 PM
> >> To: Tamar Christina <tamar.christ...@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> >> <richard.earns...@arm.com>; Marcus Shawcroft
> >> <marcus.shawcr...@arm.com>; Kyrylo Tkachov
> <kyrylo.tkac...@arm.com>
> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
> >> constants and operations
> >>
> >> Tamar Christina <tamar.christ...@arm.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford <richard.sandif...@arm.com>
> >> >> Sent: Tuesday, August 31, 2021 4:14 PM
> >> >> To: Tamar Christina <tamar.christ...@arm.com>
> >> >> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
> >> >> <richard.earns...@arm.com>; Marcus Shawcroft
> >> >> <marcus.shawcr...@arm.com>; Kyrylo Tkachov
> >> <kyrylo.tkac...@arm.com>
> >> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
> >> >> constants and operations
> >> >>
> >> >> Tamar Christina <tamar.christ...@arm.com> writes:
> >> >> > @@ -13936,8 +13937,65 @@ cost_plus:
> >> >> >                            mode, MULT, 1, speed);
> >> >> >            return true;
> >> >> >          }
> >> >> > +     break;
> >> >> > +    case PARALLEL:
> >> >> > +      /* Fall through */
> >> >>
> >> >> Which code paths lead to getting a PARALLEL here?
> >> >
> >> > Hi,
> >> >
> >> > Thanks for the review!
> >> >
> >> > I added it for completeness because CSE treats a parallel and
> >> > CONST_VECTOR as equivalent when they each entry in the parallel
> >> > defines
> >> a constant.
> >>
> >> Could you test whether it ever triggers in practice though?
> >> The code would be much simpler without it.
> >
> > Will check 😊

Looks like for AArch64 there's no real way for this to happen, so I've removed
this case.


> >
> >>
> >> >> > +    case CONST_VECTOR:
> >> >> > +     {
> >> >> > +       rtx gen_insn = aarch64_simd_make_constant (x, true);
> >> >> > +       /* Not a valid const vector.  */
> >> >> > +       if (!gen_insn)
> >> >> > +         break;
> >> >> >
> >> >> > -      /* Fall through.  */
> >> >> > +       switch (GET_CODE (gen_insn))
> >> >> > +       {
> >> >> > +       case CONST_VECTOR:
> >> >> > +         /* Load using MOVI/MVNI.  */
> >> >> > +         if (aarch64_simd_valid_immediate (x, NULL))
> >> >> > +           *cost += extra_cost->vect.movi;
> >> >> > +         else /* Load using constant pool.  */
> >> >> > +           *cost += extra_cost->ldst.load;
> >> >> > +         break;
> >> >> > +       /* Load using a DUP.  */
> >> >> > +       case VEC_DUPLICATE:
> >> >> > +         *cost += extra_cost->vect.dup;
> >> >> > +         break;
> >> >>
> >> >> Does this trigger in practice?  The new check==true path (rightly)
> >> >> stops the duplicated element from being forced into a register,
> >> >> but then I would have
> >> >> expected:
> >> >>
> >> >> rtx
> >> >> gen_vec_duplicate (machine_mode mode, rtx x) {
> >> >>   if (valid_for_const_vector_p (mode, x))
> >> >>     return gen_const_vec_duplicate (mode, x);
> >> >>   return gen_rtx_VEC_DUPLICATE (mode, x); }
> >> >>
> >> >> to generate the original CONST_VECTOR again.
> >> >
> >> > Yes, but CSE is trying to see whether using a DUP is cheaper than
> >> > another
> >> instruction.
> >> > Normal code won't hit this but CSE is just costing all the
> >> > different ways one can semantically construct a vector, which RTL
> >> > actually comes out
> >> of it depends on how it's folded as you say.
> >>
> >> But what I mean is, you call:
> >>
> >>      rtx gen_insn = aarch64_simd_make_constant (x, true);
> >>      /* Not a valid const vector.  */
> >>      if (!gen_insn)
> >>        break;
> >>
> >> where aarch64_simd_make_constant does:
> >>
> >>   if (CONST_VECTOR_P (vals))
> >>     const_vec = vals;
> >>   else if (GET_CODE (vals) == PARALLEL)
> >>     {
> >>       /* A CONST_VECTOR must contain only CONST_INTs and
> >>     CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
> >>     Only store valid constants in a CONST_VECTOR.  */
> >>       int n_elts = XVECLEN (vals, 0);
> >>       for (i = 0; i < n_elts; ++i)
> >>    {
> >>      rtx x = XVECEXP (vals, 0, i);
> >>      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> >>        n_const++;
> >>    }
> >>       if (n_const == n_elts)
> >>    const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
> >>     }
> >>   else
> >>     gcc_unreachable ();
> >>
> >>   if (const_vec != NULL_RTX
> >>       && aarch64_simd_valid_immediate (const_vec, NULL))
> >>     /* Load using MOVI/MVNI.  */
> >>     return const_vec;
> >>   else if ((const_dup = aarch64_simd_dup_constant (vals, check)) !=
> >> NULL_RTX)
> >>     /* Loaded using DUP.  */
> >>     return const_dup;
> >>
> >> and aarch64_simd_dup_constant does:
> >>
> >>   machine_mode mode = GET_MODE (vals);
> >>   machine_mode inner_mode = GET_MODE_INNER (mode);
> >>   rtx x;
> >>
> >>   if (!const_vec_duplicate_p (vals, &x))
> >>     return NULL_RTX;
> >>
> >>   /* We can load this constant by using DUP and a constant in a
> >>      single ARM register.  This will be cheaper than a vector
> >>      load.  */
> >>   if (!check)
> >>     x = copy_to_mode_reg (inner_mode, x);
> >>   return gen_vec_duplicate (mode, x);
> >>
> >> For the “check” case, “x” will be a constant, and so
> >> gen_vec_duplicate will call gen_const_vec_duplicate, which will return a
> CONST_VECTOR.
> >> It didn't seem to be possible for gen_insn to be a VEC_DUPLICATE.
> >>
> >
> > Yes, but CSE can ask the cost of a VEC_DUPLICATE directly on a
> > register without going through gen_const_vec_duplicate which is
> > intended as the gen_ functions can have side effects (e.g. creating
> >> >> > new pseudos etc.)
> >
> > If say it sees a constant x and a vector [x x x x] it wants to know
> > what the cost keeping x and materializing [x x x x] vs doing a duplicate of 
> > x
> into [x x x x] is.
> >
> > In this case since both the constant and the vectors are needed you
> > won't get a constant there but a register so you'll actually see a vec_dup. 
> > If
> CSE pushes in the constant that would defeat the point 😊. Right now it's
> CSE that's pushing constants of vec_dup into vec_constants.
> >
> > My change is making it explicitly ask for the cost of doing this
> > instead of assuming it always cheaper because for a large majority of cases
> it's not actually cheaper and is highly dependent on the targets ability to
> create said constant.
> >
> > So this hook will see both versions, the dup of the register and the
> vec_constant while CSE is trying to decide which one to keep.
> 
> But the code I quoted above is from:
> 
> +     break;
> +    case PARALLEL:
> +      /* Fall through */
> +    case CONST_VECTOR:
> +     {
> +       rtx gen_insn = aarch64_simd_make_constant (x, true);
> +       /* Not a valid const vector.  */
> +       if (!gen_insn)
> +         break;
> 
> -      /* Fall through.  */
> +       switch (GET_CODE (gen_insn))
> +       {
> +       case CONST_VECTOR:
> +         /* Load using MOVI/MVNI.  */
> +         if (aarch64_simd_valid_immediate (x, NULL))
> +           *cost += extra_cost->vect.movi;
> +         else /* Load using constant pool.  */
> +           *cost += extra_cost->ldst.load;
> +         break;
> +       /* Load using a DUP.  */
> +       case VEC_DUPLICATE:
> +         *cost += extra_cost->vect.dup;
> +         break;
> +       default:
> +         *cost += extra_cost->ldst.load;
> +         break;
> +       }
> +       return true;
> +     }
> 
> Here, CSE is passing in a PARALLEL or a CONST_VECTOR.  That rtx then gets
> passed to aarch64_simd_make_constant.  We then switch based on the
> result of aarch64_simd_make_constant, with a case statement for
> VEC_DUPLICATE.  So the code is handling a case in which
> aarch64_simd_make_constant converts a PARALLEL or a CONST_VECTOR
> (passed by CSE) into a VEC_DUPLICATE.  For the reasons above, that doesn't
> seem to be possible.  aarch64_simd_make_constant would return duplicated
> constants as a CONST_VECTOR rather than a VEC_DUPLICATE.
> 
> It sounds like you're talking about the separate top-level VEC_DUPLICATE
> case, which is obviously OK/needed.

Yes, my apologies, I had completely misunderstood the quoted context ☹
This case indeed cannot happen.  I've placed an assert there in case someone
changes that function in the future.
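
For reference, the nested case now reads roughly as follows (this is just a
condensed copy of the CONST_VECTOR hunk in the inline patch below, with a
sketch of the reasoning in the comments):

  switch (GET_CODE (gen_insn))
    {
    case CONST_VECTOR:
      /* Load using MOVI/MVNI.  */
      if (aarch64_simd_valid_immediate (x, NULL))
        *cost += extra_cost->vect.movi;
      else /* Load using constant pool.  */
        *cost += extra_cost->ldst.load;
      break;
    case VEC_DUPLICATE:
      /* aarch64_simd_make_constant folds a dup-able constant back into
         a CONST_VECTOR, so this cannot currently happen.  */
      gcc_unreachable ();
      break;
    default:
      *cost += extra_cost->ldst.load;
      break;
    }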

> 
> Maybe it would be better to turn it around and say: do you have a case in
> which the nested VEC_DUPLICATE case above is reached?
> 
> >> This would be much simpler if we could call
> >> aarch64_simd_valid_immediate and aarch64_simd_dup_constant directly
> >> from the rtx cost code,
> 
> BTW, I meant const_vec_duplicate_p here. sorry.
> 
> > Agreed... I tried to separate them before, but the logic was annoying
> > to split and I thought not worth the effort, so instead I just changed it to
> have a checking only mode.
> >
> >> hence the
> >> question about whether the PARALLEL stuff was really needed in practice.
> >>
> >> >> > +       default:
> >> >> > +         *cost += extra_cost->ldst.load;
> >> >> > +         break;
> >> >> > +       }
> >> >> > +       return true;
> >> >> > +     }
> >> >> > +    case VEC_CONCAT:
> >> >> > +     /* depending on the operation, either DUP or INS.
> >> >> > +        For now, keep default costing.  */
> >> >> > +     break;
> >> >> > +    case VEC_DUPLICATE:
> >> >> > +     *cost += extra_cost->vect.dup;
> >> >> > +     return true;
> >> >> > +    case VEC_SELECT:
> >> >> > +     {
> >> >> > +       /* cost subreg of 0 as free, otherwise as DUP */
> >> >> > +       rtx op1 = XEXP (x, 1);
> >> >> > +       int nelts;
> >> >> > +       if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
> >> >> > +           || (BYTES_BIG_ENDIAN
> >> >> > +               && GET_MODE_NUNITS (mode).is_constant(&nelts)
> >> >> > +               && INTVAL (op1) == nelts - 1))
> >> >> > +         ;
> >> >> > +       else if (vec_series_lowpart_p (mode, GET_MODE (op1),
> op1))
> >> >> > +         ;
> >> >> > +       else if (vec_series_highpart_p (mode, GET_MODE (op1),
> op1))
> >> >> > +       /* Selecting the high part is not technically free, but we 
> >> >> > lack
> >> >> > +          enough information to decide that here.  For instance
> selecting
> >> >> > +          the high-part of a vec_dup *is* free or to feed into any
> _high
> >> >> > +          instruction.   Both of which we can't really tell.  That 
> >> >> > said
> >> >> > +          have a better chance to optimize an dup vs multiple
> constants.  */
> >> >> > +         ;
> >> >>
> >> >> Not sure about this.  We already try to detect the latter case
> >> >> (_high
> >> >> instructions) via aarch64_strip_extend_vec_half.  We might be
> >> >> missing some cases, but that still feels like the right way to go IMO.
> >> >
> >> > That's a different problem from what I understand.  What this is
> >> > trying to say is that If you have a vector [x y a b] and you need
> >> > vector [x y] that you can use the top part of the original vector for 
> >> > this.
> >> >
> >> > This is an approximation, because something that can be created
> >> > with a movi is probably Cheaper to keep distinct if it's not going
> >> > to be paired with a
> >> _high operation (since you will have a dup then).
> >> >
> >> > The problem is that the front end has already spit the two Vectors
> >> > into [x y
> >> a b] and [x y].
> >> > There's nothing else that tries to consolidate them back up if both
> survive.
> >> >
> >> > As a consequence of this, the testcase test0 is not handled optimally.
> >> > It would instead create
> >> > 2 vectors, both of movi 0x3, just one being 64-bits and one being 128-
> bits.
> >> >
> >> > So if the cost of selecting it is cheaper than the movi, cse will
> >> > not consolidate the vectors, and because movi's are so cheap, the
> >> > only cost that worked was 0.  But increasing the costs of movi's
> >> > requires the
> >> costs of everything to be increased (including loads).
> >> >
> >> > I preferred to 0 out the cost, because the worst that can happen is
> >> > an dup instead of a movi, And at best a dup instead of a load from
> >> > a pool (if
> >> the constant is complicated).
> >>
> >> Hmm, will need to look at this more tomorrow.
> >>
> >> >> Selecting the high part of a vec_dup should get folded into
> >> >> another
> >> vec_dup.
> >> >>
> >> >> The lowpart bits look OK, but which paths call this function
> >> >> without first simplifying the select to a subreg?  The subreg is
> >> >> now the canonical form (thanks to r12-2288).
> >> >
> >> > The simplification will happen during folding in cse or in combine.
> >> > This costing happens before the folding, When CSE is trying to
> >> > decide
> >> whether to undo the front end's lowering of constants.
> >> >
> >> > To do so it models the constants and the semantic operation
> >> > required to extract them. E.g. to get
> >> > 2 out of [0 2 4 5] it would need a VEC_SELECT of 1. And I don't
> >> > treat the first element/bottom part special Here.  Costing wise
> >> > they would be
> >> the same.
> >>
> >> But which code path creates the VEC_SELECT?  We don't need any
> >> context to know that the VEC_SELECT is non-canonical.  It's obvious
> >> from the operands of the VEC_SELECT in isolation.
> >
> > The non-canonical RTL is never generated.  I assume we're talking
> > about the 0 case here Since subregs can't select arbitrary elements (as I
> asked before).
> >
> > For the 0 case it's only temporarily modelled as such as such to keep the
> CSE alternative costing simple.
> > Currently it's just a for loop for I = 0 to vec_elems.
> 
> Ah, sorry, I see now that you're talking about the 1/2 patch.
> I looked at this one first :-)
> 
> > When it comes time to generate the actual insn fold_rtx is called
> > which will fold the VEC_SELECT Into a subreg.
> >
> > So it's never emitted into the instruction stream in its non canonical form.
> >
> >>
> >> I'd just rather tackle this at source than try to get the cost code
> >> to handle non-canonical rtl.
> >
> > If that's what is preferred I can change the CSE patch to generate a
> > subreg for the 0 case, I'm not sure I agree with it as CSE is just
> > trying to ask "what Is the cost of selecting the element 0 in this vector".
> And as I mentioned before it never emits the instruction unfolded.  This
> representation seems to a more logical representation for costing to me.
> 
> I think it's better to cost what we intend to generate.  Otherwise each target
> needs to handle both forms: “CSE asks about this, but actually intends to
> generate that instead”.
> 
> > It's however unfortunate that there's only one costing callback, as
> > far as CSE is concerned the representation/form doesn't matter, it's just
> looking at the high level operation.
> >
> > Or is the concern here that most targets will have costing for subreg
> > 0 but not VEC_SELECT? In which case without Actually handling the
> > costs of the other operations the CSE changes won't do anything for targets
> anyway.  And it would be odd for a target to cost VEC_SELECT from 1 to <N>
> instead of just costing 0 too.
> 
> Well, even for the motivating target (aarch64), we had to make changes to
> treat index 0 as especially cheap.  That's likely to be necessary on other
> targets too, if they want to take advantage of this.  The for loop exists
> because the index matters.
> 
> I'm still a bit sceptical about treating the high-part cost as lower.
> ISTM that the subreg cases are the ones that are truly “free” and any others
> should have a normal cost.  So if CSE handled the subreg case itself (to model
> how the rtx would actually be generated) then aarch64 code would have to
> do less work.  I imagine that will be true for other targets as well.

I guess the main problem is that CSE lacks context: it's not until after combine
that the high part becomes truly "free", once it has been pushed into a _high
operation.

For CSE I don't think there's any real point in rematerializing the constant
twice unless it's needed on a different part of the register file.  So in the
int (v2si) case, if the low part is needed on the SIMD side but the high part
on the genreg side, it's most likely cheaper to create the integer on the
integer side using mov/movk than to do a transfer from SIMD to genreg.
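
For concreteness, the kind of cross-register-file reuse being discussed is what
test1/test2 in the new testcase exercise; roughly (illustrative only, the
function name is made up):

  #include <arm_neon.h>

  uint64_t
  split_use (uint64_t a, uint64x2_t b, uint64x2_t *rt)
  {
    /* The same constant is wanted as a scalar on the genreg side (the OR)
       and as a vector on the SIMD side (the ADD).  */
    uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
    uint64_t scalar = vgetq_lane_u64 (val, 0);
    *rt = vaddq_u64 (val, b);
    return a | scalar;
  }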

But I believe this to be an outlier; in all other cases, having a DUP, which may
get removed, should be better than rematerializing.

I think the question is whether this could be problematic for other passes that
use the cost model.  The only other pass that could use this is combine, I
think, which runs after cse1.

But even if cse1 didn't handle it, when costing a _high operation we normally
cost the register use as free anyway, so nothing changes.

To the best of my reasoning I think it's safe/beneficial, but I can run
benchmarks on some of our intrinsics-heavy code if that would help.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/arm/aarch-common-protos.h (struct vector_cost_table): Add
        movi, dup and extract costing fields.
        * config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
        thunderx_extra_costs, thunderx2t99_extra_costs,
        thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
        them.
        * config/arm/aarch-cost-tables.h (generic_extra_costs,
        cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
        exynosm1_extra_costs, xgene1_extra_costs): Likewise.
        * config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
        * config/aarch64/aarch64.c (aarch64_simd_make_constant): Add forward
        declaration and support check-only mode.
        (aarch64_rtx_costs): Add costs for CONST_VECTOR, VEC_CONCAT,
        VEC_DUPLICATE and VEC_SELECT.
        (aarch64_simd_dup_constant): Support check-only mode.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vect-cse-codegen.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1), /* Alu.  */
-    COSTS_N_INSNS (4)  /* mult.  */
+    COSTS_N_INSNS (4), /* mult.  */
+    COSTS_N_INSNS (1), /* movi.  */
+    COSTS_N_INSNS (2), /* dup.  */
+    COSTS_N_INSNS (2)  /* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1), /* Alu.  */
-    COSTS_N_INSNS (4)  /* Mult.  */
+    COSTS_N_INSNS (4), /* Mult.  */
+    COSTS_N_INSNS (1), /* movi.  */
+    COSTS_N_INSNS (2), /* dup.  */
+    COSTS_N_INSNS (2)  /* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1), /* Alu.  */
-    COSTS_N_INSNS (4)  /* Mult.  */
+    COSTS_N_INSNS (4), /* Mult.  */
+    COSTS_N_INSNS (1), /* movi.  */
+    COSTS_N_INSNS (2), /* dup.  */
+    COSTS_N_INSNS (2)  /* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 48eddf64e05afe3788abfa05141f6544a9323ea1..371990fbe2cfb72d22f22ed582bb7ebdebb3edc0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
        (vec_duplicate:VDQF_F16
-         (match_operand:<VEL> 1 "register_operand" "w")))]
+         (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 26d59ba1e13758eed47598c101fd214788637be4..483f1079f3d3967bfd16047c2b8447078c37313c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -303,6 +303,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
                                            aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -12703,7 +12704,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -14004,8 +14005,57 @@ cost_plus:
                             mode, MULT, 1, speed);
           return true;
         }
+       break;
+    case CONST_VECTOR:
+       {
+         rtx gen_insn = aarch64_simd_make_constant (x, true);
+         /* Not a valid const vector.  */
+         if (!gen_insn)
+           break;
 
-      /* Fall through.  */
+         switch (GET_CODE (gen_insn))
+         {
+         case CONST_VECTOR:
+           /* Load using MOVI/MVNI.  */
+           if (aarch64_simd_valid_immediate (x, NULL))
+             *cost += extra_cost->vect.movi;
+           else /* Load using constant pool.  */
+             *cost += extra_cost->ldst.load;
+           break;
+         /* A dup-able constant is folded back into a CONST_VECTOR.  */
+         case VEC_DUPLICATE:
+           gcc_unreachable ();
+           break;
+         default:
+           *cost += extra_cost->ldst.load;
+           break;
+         }
+         return true;
+       }
+    case VEC_CONCAT:
+       /* Depending on the operation, either a DUP or an INS.
+          For now, keep the default costing.  */
+       break;
+    case VEC_DUPLICATE:
+       *cost += extra_cost->vect.dup;
+       return true;
+    case VEC_SELECT:
+       {
+         /* Cost a subreg of element 0 as free, otherwise as a DUP.  */
+         rtx op1 = XEXP (x, 1);
+         if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+           ;
+         else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+         /* Selecting the high part is not technically free, but we lack
+            enough information to decide that here.  For instance, selecting
+            the high part of a vec_dup *is* free, as is feeding it into a _high
+            instruction, but we can't tell either of those here.  That said, we
+            have a better chance of optimizing a dup than multiple constants.  */
+           ;
+         else
+           *cost += extra_cost->vect.extract;
+         return true;
+       }
     default:
       break;
     }
@@ -20634,9 +20684,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
 
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
-   assign to the register.  Otherwise return NULL_RTX.  */
+   assign to the register.  Otherwise return NULL_RTX.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20648,7 +20701,8 @@ aarch64_simd_dup_constant (rtx vals)
   /* We can load this constant by using DUP and a constant in a
      single ARM register.  This will be cheaper than a vector
      load.  */
-  x = copy_to_mode_reg (inner_mode, x);
+  if (!check)
+    x = copy_to_mode_reg (inner_mode, x);
   return gen_vec_duplicate (mode, x);
 }
 
@@ -20656,9 +20710,12 @@ aarch64_simd_dup_constant (rtx vals)
 /* Generate code to load VALS, which is a PARALLEL containing only
    constants (for vec_init) or CONST_VECTOR, efficiently into a
    register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   rtx const_dup;
@@ -20690,7 +20747,7 @@ aarch64_simd_make_constant (rtx vals)
       && aarch64_simd_valid_immediate (const_vec, NULL))
     /* Load using MOVI/MVNI.  */
     return const_vec;
-  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
     /* Loaded using DUP.  */
     return const_dup;
   else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1), /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1), /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..36e468aacfadd7701c6a7cd432bee81472111a16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test0:
+**     movi    v2.16b, 0x3
+**     ldr     q0, \[x0\]
+**     uxtl    v1.8h, v0.8b
+**     uxtl2   v0.8h, v0.16b
+**     ldr     q3, \[x1\]
+**     umlal   v1.8h, v3.8b, v2.8b
+**     umlal2  v0.8h, v3.16b, v2.16b
+**     addhn   v0.8b, v1.8h, v0.8h
+**     str     d0, \[x2\]
+**     ret
+*/
+
+void test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
+{
+  uint8x16_t three_u8 = vdupq_n_u8(3);
+  uint8x16_t x = vld1q_u8(inptr0);
+  uint8x16_t y = vld1q_u8(inptr1);
+  uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
+  uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
+  uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), vget_low_u8(three_u8));
+  uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), vget_high_u8(three_u8));
+  vst1_u8(outptr0, vaddhn_u16(z_l, z_h));
+}
+
+/*
+**test1:
+**     sub     sp, sp, #16
+**     adrp    x2, .LC0
+**     ldr     q1, \[x2, #:lo12:.LC0\]
+**     add     v0.2d, v1.2d, v0.2d
+**     str     q0, \[x1\]
+**     fmov    x1, d1
+**     orr     x0, x0, x1
+**     add     sp, sp, 16
+**     ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**     adrp    x2, .LC1
+**     ldr     q1, \[x2, #:lo12:.LC1\]
+**     add     v0.2d, v0.2d, v1.2d
+**     str     q0, \[x1\]
+**     fmov    x1, d1
+**     orr     x0, x0, x1
+**     ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**     sub     sp, sp, #16
+**     adrp    x2, .LC2
+**     ldr     q1, \[x2, #:lo12:.LC2\]
+**     add     v0.4s, v1.4s, v0.4s
+**     str     q0, \[x1\]
+**     fmov    w1, s1
+**     orr     w0, w0, w1
+**     add     sp, sp, 16
+**     ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**     ushr    v0.16b, v0.16b, 7
+**     mov     x0, 16512
+**     movk    x0, 0x1020, lsl 16
+**     movk    x0, 0x408, lsl 32
+**     movk    x0, 0x102, lsl 48
+**     fmov    d1, x0
+**     pmull   v2.1q, v0.1d, v1.1d
+**     dup     v1.2d, v1.d\[0\]
+**     pmull2  v0.1q, v0.2d, v1.2d
+**     trn2    v2.8b, v2.8b, v0.8b
+**     umov    w0, v2.h\[3\]
+**     ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+
