Re: [PATCH] [AARCH64] Improve vector generation cost model

2019-12-13 Thread Kyrill Tkachov

Hi Andrew,

On 3/15/19 1:18 AM, apin...@marvell.com wrote:

From: Andrew Pinski 

Hi,
  On OcteonTX2, ld1r and ld1 (with a single lane) are split
into two different micro-ops unlike most other targets.
This adds three extra costs to the cost table:
ld1_dup: used for "ld1r {v0.4s}, [x0]"
merge_dup: used for "dup v0.4s, v0.4s[0]" and "ins v0.4s[0], v0.4s[0]"
ld1_merge: used fir "ld1 {v0.4s}[0], [x0]"

OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.



Sorry for the slow reply, missed it on gcc-patches :(



Thanks,
Andrew Pinski

ChangeLog:
* config/arm/aarch-common-protos.h (vector_cost_table):
Add merge_dup, ld1_merge, and ld1_dup.
* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs):
Update for the new fields.
(thunderx_extra_costs): Likewise.
(thunderx2t99_extra_costs): Likewise.
(tsv110_extra_costs): Likewise.
* config/arm/aarch-cost-tables.h (generic_extra_costs): Likewise.
(cortexa53_extra_costs): Likewise.
(cortexa57_extra_costs): Likewise.
(exynosm1_extra_costs): Likewise.
(xgene1_extra_costs): Likewise.
* config/aarch64/aarch64.c (aarch64_rtx_costs): Handle vec_dup of a 
memory.

Hanlde vec_merge of a memory.

Signed-off-by: Andrew Pinski 
---
 gcc/config/aarch64/aarch64-cost-tables.h | 20 +++
 gcc/config/aarch64/aarch64.c | 22 +
 gcc/config/arm/aarch-common-protos.h |  3 +++
 gcc/config/arm/aarch-cost-tables.h   | 25 +++-
 4 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h 
b/gcc/config/aarch64/aarch64-cost-tables.h

index 5c9442e1b89..9a7c70ba595 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -123,7 +123,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   },
   /* Vector */
   {
-    COSTS_N_INSNS (1)  /* alu.  */
+    COSTS_N_INSNS (1),  /* Alu.  */
+    COSTS_N_INSNS (1), /* dup_merge.  */
+    COSTS_N_INSNS (1), /* ld1_merge.  */
+    COSTS_N_INSNS (1)  /* ld1_dup.  */
   }
 };

@@ -227,7 +230,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   },
   /* Vector */
   {
-    COSTS_N_INSNS (1)  /* Alu.  */
+    COSTS_N_INSNS (1), /* Alu.  */
+    COSTS_N_INSNS (1), /* dup_merge.  */
+    COSTS_N_INSNS (1), /* ld1_merge.  */
+    COSTS_N_INSNS (1)  /* ld1_dup.  */
   }
 };

@@ -330,7 +336,10 @@ const struct cpu_cost_table 
thunderx2t99_extra_costs =

   },
   /* Vector */
   {
-    COSTS_N_INSNS (1)  /* Alu.  */
+    COSTS_N_INSNS (1), /* Alu.  */
+    COSTS_N_INSNS (1), /* dup_merge.  */
+    COSTS_N_INSNS (1), /* ld1_merge.  */
+    COSTS_N_INSNS (1)  /* ld1_dup.  */
   }
 };

@@ -434,7 +443,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   },
   /* Vector */
   {
-    COSTS_N_INSNS (1)  /* alu.  */
+    COSTS_N_INSNS (1), /* Alu.  */
+    COSTS_N_INSNS (1), /* dup_merge.  */
+    COSTS_N_INSNS (1), /* ld1_merge.  */
+    COSTS_N_INSNS (1)  /* ld1_dup.  */
   }
 };

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b38505b0872..dc4d3d39af8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10568,6 +10568,28 @@ cost_plus:
 }
   break;

+    case VEC_DUPLICATE:
+  if (!speed)
+   return false;



If I read the code right, before this patch we would be returning true 
for !speed i.e. not recursing.


Do we want to trigger a recursion now?



+
+  if (GET_CODE (XEXP (x, 0)) == MEM)
+   *cost += extra_cost->vect.ld1_dup;



Please use MEM_P here.



+  else
+   *cost += extra_cost->vect.merge_dup;
+  return true;
+
+    case VEC_MERGE:
+  if (speed && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE)
+   {
+ if (GET_CODE (XEXP (XEXP (x, 0), 0)) == MEM)



And here.

Thanks,

Kyrill



+   *cost += extra_cost->vect.ld1_merge;
+ else
+   *cost += extra_cost->vect.merge_dup;
+ return true;
+   }
+  break;
+
+
 case TRUNCATE:

   /* Decompose muldi3_highpart.  */
diff --git a/gcc/config/arm/aarch-common-protos.h 
b/gcc/config/arm/aarch-common-protos.h

index 11cd5145bbc..dbc1282402a 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -131,6 +131,9 @@ struct fp_cost_table
 struct vector_cost_table
 {
   const int alu;
+  const int merge_dup;
+  const int ld1_merge;
+  const int ld1_dup;
 };

 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h 
b/gcc/config/arm/aarch-cost-tables.h

index bc33efadc6c..a51bc668f56 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -121,7 +121,10 @@ const struct cpu_cost_table generic_extra_costs =
   },
   /* Vector */
   {
-    COSTS_N_INSNS (1)  /* alu.  */
+    COSTS_N_INSNS (1),  /* alu.  */
+    COSTS_N_INSNS (1), /* dup_merge.  */
+    COSTS_N_INSNS (1), /* ld1_merge.  */
+    COSTS_N_INSNS (1)  /* ld1_dup.  */
   }
 };

@@ -224,7 +227,10 @@ const struct cpu_cost_table 

Re: [PATCH] [AARCH64] Improve vector generation cost model

2019-12-07 Thread Andrew Pinski
On Thu, May 2, 2019 at 9:10 AM Andrew Pinski  wrote:
>
> On Thu, Mar 14, 2019 at 6:19 PM  wrote:
> >
> > From: Andrew Pinski 
> >
> > Hi,
> >   On OcteonTX2, ld1r and ld1 (with a single lane) are split
> > into two different micro-ops unlike most other targets.
> > This adds three extra costs to the cost table:
> > ld1_dup: used for "ld1r {v0.4s}, [x0]"
> > merge_dup: used for "dup v0.4s, v0.4s[0]" and "ins v0.4s[0], v0.4s[0]"
> > ld1_merge: used fir "ld1 {v0.4s}[0], [x0]"
> >
> > OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>
> Ping?  It has been 1.5 months now.

Ping?  I have bootstrapped and tested on aarch64-linux-gnu recently
with the patch.
Or does this has to wait until Stage 1?

Thanks,
Andrew

>
> >
> > Thanks,
> > Andrew Pinski
> >
> > ChangeLog:
> > * config/arm/aarch-common-protos.h (vector_cost_table):
> > Add merge_dup, ld1_merge, and ld1_dup.
> > * config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs):
> > Update for the new fields.
> > (thunderx_extra_costs): Likewise.
> > (thunderx2t99_extra_costs): Likewise.
> > (tsv110_extra_costs): Likewise.
> > * config/arm/aarch-cost-tables.h (generic_extra_costs): Likewise.
> > (cortexa53_extra_costs): Likewise.
> > (cortexa57_extra_costs): Likewise.
> > (exynosm1_extra_costs): Likewise.
> > (xgene1_extra_costs): Likewise.
> > * config/aarch64/aarch64.c (aarch64_rtx_costs): Handle vec_dup of a memory.
> > Hanlde vec_merge of a memory.
> >
> > Signed-off-by: Andrew Pinski 
> > ---
> >  gcc/config/aarch64/aarch64-cost-tables.h | 20 +++
> >  gcc/config/aarch64/aarch64.c | 22 +
> >  gcc/config/arm/aarch-common-protos.h |  3 +++
> >  gcc/config/arm/aarch-cost-tables.h   | 25 +++-
> >  4 files changed, 61 insertions(+), 9 deletions(-)
> >
> > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h 
> > b/gcc/config/aarch64/aarch64-cost-tables.h
> > index 5c9442e1b89..9a7c70ba595 100644
> > --- a/gcc/config/aarch64/aarch64-cost-tables.h
> > +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> > @@ -123,7 +123,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
> >},
> >/* Vector */
> >{
> > -COSTS_N_INSNS (1)  /* alu.  */
> > +COSTS_N_INSNS (1),  /* Alu.  */
> > +COSTS_N_INSNS (1), /* dup_merge.  */
> > +COSTS_N_INSNS (1), /* ld1_merge.  */
> > +COSTS_N_INSNS (1)  /* ld1_dup.  */
> >}
> >  };
> >
> > @@ -227,7 +230,10 @@ const struct cpu_cost_table thunderx_extra_costs =
> >},
> >/* Vector */
> >{
> > -COSTS_N_INSNS (1)  /* Alu.  */
> > +COSTS_N_INSNS (1), /* Alu.  */
> > +COSTS_N_INSNS (1), /* dup_merge.  */
> > +COSTS_N_INSNS (1), /* ld1_merge.  */
> > +COSTS_N_INSNS (1)  /* ld1_dup.  */
> >}
> >  };
> >
> > @@ -330,7 +336,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
> >},
> >/* Vector */
> >{
> > -COSTS_N_INSNS (1)  /* Alu.  */
> > +COSTS_N_INSNS (1), /* Alu.  */
> > +COSTS_N_INSNS (1), /* dup_merge.  */
> > +COSTS_N_INSNS (1), /* ld1_merge.  */
> > +COSTS_N_INSNS (1)  /* ld1_dup.  */
> >}
> >  };
> >
> > @@ -434,7 +443,10 @@ const struct cpu_cost_table tsv110_extra_costs =
> >},
> >/* Vector */
> >{
> > -COSTS_N_INSNS (1)  /* alu.  */
> > +COSTS_N_INSNS (1), /* Alu.  */
> > +COSTS_N_INSNS (1), /* dup_merge.  */
> > +COSTS_N_INSNS (1), /* ld1_merge.  */
> > +COSTS_N_INSNS (1)  /* ld1_dup.  */
> >}
> >  };
> >
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index b38505b0872..dc4d3d39af8 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -10568,6 +10568,28 @@ cost_plus:
> >  }
> >break;
> >
> > +case VEC_DUPLICATE:
> > +  if (!speed)
> > +   return false;
> > +
> > +  if (GET_CODE (XEXP (x, 0)) == MEM)
> > +   *cost += extra_cost->vect.ld1_dup;
> > +  else
> > +   *cost += extra_cost->vect.merge_dup;
> > +  return true;
> > +
> > +case VEC_MERGE:
> > +  if (speed && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE)
> > +   {
> > + if (GET_CODE (XEXP (XEXP (x, 0), 0)) == MEM)
> > +   *cost += extra_cost->vect.ld1_merge;
> > + else
> > +   *cost += extra_cost->vect.merge_dup;
> > + return true;
> > +   }
> > +  break;
> > +
> > +
> >  case TRUNCATE:
> >
> >/* Decompose muldi3_highpart.  */
> > diff --git a/gcc/config/arm/aarch-common-protos.h 
> > b/gcc/config/arm/aarch-common-protos.h
> > index 11cd5145bbc..dbc1282402a 100644
> > --- a/gcc/config/arm/aarch-common-protos.h
> > +++ b/gcc/config/arm/aarch-common-protos.h
> > @@ -131,6 +131,9 @@ struct fp_cost_table
> >  struct vector_cost_table
> >  {
> >const int alu;
> > +  const int merge_dup;
> > +  const int ld1_merge;
> > +  const int ld1_dup;
> >  };
> >
> >  struct cpu_cost_table
> > diff --git a/gcc/config/arm/aarch-cost-tables.h 
> > 

Re: [PATCH] [AARCH64] Improve vector generation cost model

2019-05-02 Thread Andrew Pinski
On Thu, Mar 14, 2019 at 6:19 PM  wrote:
>
> From: Andrew Pinski 
>
> Hi,
>   On OcteonTX2, ld1r and ld1 (with a single lane) are split
> into two different micro-ops unlike most other targets.
> This adds three extra costs to the cost table:
> ld1_dup: used for "ld1r {v0.4s}, [x0]"
> merge_dup: used for "dup v0.4s, v0.4s[0]" and "ins v0.4s[0], v0.4s[0]"
> ld1_merge: used fir "ld1 {v0.4s}[0], [x0]"
>
> OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.

Ping?  It has been 1.5 months now.

>
> Thanks,
> Andrew Pinski
>
> ChangeLog:
> * config/arm/aarch-common-protos.h (vector_cost_table):
> Add merge_dup, ld1_merge, and ld1_dup.
> * config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs):
> Update for the new fields.
> (thunderx_extra_costs): Likewise.
> (thunderx2t99_extra_costs): Likewise.
> (tsv110_extra_costs): Likewise.
> * config/arm/aarch-cost-tables.h (generic_extra_costs): Likewise.
> (cortexa53_extra_costs): Likewise.
> (cortexa57_extra_costs): Likewise.
> (exynosm1_extra_costs): Likewise.
> (xgene1_extra_costs): Likewise.
> * config/aarch64/aarch64.c (aarch64_rtx_costs): Handle vec_dup of a memory.
> Hanlde vec_merge of a memory.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/config/aarch64/aarch64-cost-tables.h | 20 +++
>  gcc/config/aarch64/aarch64.c | 22 +
>  gcc/config/arm/aarch-common-protos.h |  3 +++
>  gcc/config/arm/aarch-cost-tables.h   | 25 +++-
>  4 files changed, 61 insertions(+), 9 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h 
> b/gcc/config/aarch64/aarch64-cost-tables.h
> index 5c9442e1b89..9a7c70ba595 100644
> --- a/gcc/config/aarch64/aarch64-cost-tables.h
> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> @@ -123,7 +123,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
>},
>/* Vector */
>{
> -COSTS_N_INSNS (1)  /* alu.  */
> +COSTS_N_INSNS (1),  /* Alu.  */
> +COSTS_N_INSNS (1), /* dup_merge.  */
> +COSTS_N_INSNS (1), /* ld1_merge.  */
> +COSTS_N_INSNS (1)  /* ld1_dup.  */
>}
>  };
>
> @@ -227,7 +230,10 @@ const struct cpu_cost_table thunderx_extra_costs =
>},
>/* Vector */
>{
> -COSTS_N_INSNS (1)  /* Alu.  */
> +COSTS_N_INSNS (1), /* Alu.  */
> +COSTS_N_INSNS (1), /* dup_merge.  */
> +COSTS_N_INSNS (1), /* ld1_merge.  */
> +COSTS_N_INSNS (1)  /* ld1_dup.  */
>}
>  };
>
> @@ -330,7 +336,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
>},
>/* Vector */
>{
> -COSTS_N_INSNS (1)  /* Alu.  */
> +COSTS_N_INSNS (1), /* Alu.  */
> +COSTS_N_INSNS (1), /* dup_merge.  */
> +COSTS_N_INSNS (1), /* ld1_merge.  */
> +COSTS_N_INSNS (1)  /* ld1_dup.  */
>}
>  };
>
> @@ -434,7 +443,10 @@ const struct cpu_cost_table tsv110_extra_costs =
>},
>/* Vector */
>{
> -COSTS_N_INSNS (1)  /* alu.  */
> +COSTS_N_INSNS (1), /* Alu.  */
> +COSTS_N_INSNS (1), /* dup_merge.  */
> +COSTS_N_INSNS (1), /* ld1_merge.  */
> +COSTS_N_INSNS (1)  /* ld1_dup.  */
>}
>  };
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index b38505b0872..dc4d3d39af8 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -10568,6 +10568,28 @@ cost_plus:
>  }
>break;
>
> +case VEC_DUPLICATE:
> +  if (!speed)
> +   return false;
> +
> +  if (GET_CODE (XEXP (x, 0)) == MEM)
> +   *cost += extra_cost->vect.ld1_dup;
> +  else
> +   *cost += extra_cost->vect.merge_dup;
> +  return true;
> +
> +case VEC_MERGE:
> +  if (speed && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE)
> +   {
> + if (GET_CODE (XEXP (XEXP (x, 0), 0)) == MEM)
> +   *cost += extra_cost->vect.ld1_merge;
> + else
> +   *cost += extra_cost->vect.merge_dup;
> + return true;
> +   }
> +  break;
> +
> +
>  case TRUNCATE:
>
>/* Decompose muldi3_highpart.  */
> diff --git a/gcc/config/arm/aarch-common-protos.h 
> b/gcc/config/arm/aarch-common-protos.h
> index 11cd5145bbc..dbc1282402a 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -131,6 +131,9 @@ struct fp_cost_table
>  struct vector_cost_table
>  {
>const int alu;
> +  const int merge_dup;
> +  const int ld1_merge;
> +  const int ld1_dup;
>  };
>
>  struct cpu_cost_table
> diff --git a/gcc/config/arm/aarch-cost-tables.h 
> b/gcc/config/arm/aarch-cost-tables.h
> index bc33efadc6c..a51bc668f56 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -121,7 +121,10 @@ const struct cpu_cost_table generic_extra_costs =
>},
>/* Vector */
>{
> -COSTS_N_INSNS (1)  /* alu.  */
> +COSTS_N_INSNS (1),  /* alu.  */
> +COSTS_N_INSNS (1), /* dup_merge.  */
> +COSTS_N_INSNS (1), /* ld1_merge.  */
> +COSTS_N_INSNS (1)  /* ld1_dup.  */
>}
>  };
>
> @@ -224,7