Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Siddhesh Poyarekar
Sorry, I fixed a couple of typos that prevented the patch from actually
working; here's the updated version.  I'll be building on ADDR_QUERY_STR
to identify and prevent pre/post-increment addresses for stores on
falkor.
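Here is a rough sketch of where that follow-up would sit: another check in
aarch64_classify_address keyed off the same query type.  The tuning flag
named below is hypothetical and the final condition will need separate
tuning, so treat this as illustrative only:

  /* Illustrative sketch only: reject writeback addressing for stores.
     AARCH64_EXTRA_TUNE_SLOW_WRITEBACK_STORE is a hypothetical flag, not
     something this patch adds.  */
  if (type == ADDR_QUERY_STR
      && (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_SLOW_WRITEBACK_STORE)
      && (GET_CODE (x) == PRE_INC || GET_CODE (x) == POST_INC
          || GET_CODE (x) == PRE_DEC || GET_CODE (x) == POST_DEC
          || GET_CODE (x) == PRE_MODIFY || GET_CODE (x) == POST_MODIFY))
    return false;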

Siddhesh

2018-xx-xx  Jim Wilson  
Kugan Vivenakandarajah  
Siddhesh Poyarekar  

gcc/
* gcc/config/aarch64/aarch64-protos.h (aarch64_addr_query_type):
New member ADDR_QUERY_STR.
* gcc/config/aarch64/aarch64-tuning-flags.def
(SLOW_REGOFFSET_QUADWORD_STORE): New.
* gcc/config/aarch64/aarch64.c (qdf24xx_tunings): Add
SLOW_REGOFFSET_QUADWORD_STORE to tuning flags.
(aarch64_classify_address): Avoid register indexing for quad
mode stores when SLOW_REGOFFSET_QUADWORD_STORE is set.
* gcc/config/aarch64/constraints.md (Uts): New constraint.
* gcc/config/aarch64/aarch64.md (movti_aarch64, movtf_aarch64):
Use it.
* gcc/config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>):
Likewise.

gcc/testsuite/
* gcc/testsuite/gcc.target/aarch64/pr82533.c: New test case.

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 159bc6aee7e..15924fc3f58 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -120,6 +120,9 @@ enum aarch64_symbol_type
ADDR_QUERY_LDP_STP
   Query what is valid for a load/store pair.
 
+   ADDR_QUERY_STR
+  Query what is valid for a store.
+
ADDR_QUERY_ANY
   Query what is valid for at least one memory constraint, which may
   allow things that "m" doesn't.  For example, the SVE LDR and STR
@@ -128,6 +131,7 @@ enum aarch64_symbol_type
 enum aarch64_addr_query_type {
   ADDR_QUERY_M,
   ADDR_QUERY_LDP_STP,
+  ADDR_QUERY_STR,
   ADDR_QUERY_ANY
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a01cb7..48d92702723 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -131,9 +131,9 @@
 
 (define_insn "*aarch64_simd_mov"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-   "=w, Umq,  m,  w, ?r, ?w, ?r, w")
+   "=w, Umq, Uts,  w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
-   "m,  Dz, w,  w,  w,  r,  r, Dn"))]
+   "m,  Dz,w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
&& (register_operand (operands[0], mode)
|| aarch64_simd_reg_or_zero (operands[1], mode))"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index ea9ead234cb..04baf5b6de6 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
+/* Don't use a register offset in a memory address for a quad-word store.  */
+AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store",
+                             SLOW_REGOFFSET_QUADWORD_STORE)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0599a79bfeb..664d4a18354 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -894,7 +894,7 @@ static const struct tune_params qdf24xx_tunings =
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE),  /* tune_flags.  */
  &qdf24xx_prefetch_tune
 };
 
@@ -5531,6 +5531,16 @@ aarch64_classify_address (struct aarch64_address_info *info,
|| vec_flags == VEC_ADVSIMD
|| vec_flags == VEC_SVE_DATA));
 
+  /* Avoid register indexing for 128-bit stores when the
+     AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE option is set.  */
+  if (!optimize_size
+      && type == ADDR_QUERY_STR
+      && (aarch64_tune_params.extra_tuning_flags
+          & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE)
+      && (mode == TImode || mode == TFmode
+          || aarch64_vector_data_mode_p (mode)))
+    allow_reg_index_p = false;
+
   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
  [Rn, #offset, MUL VL].  */
   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index edb6a758333..348b867ff7f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1079,7 +1079,7 @@
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-"nonimmediate_operand"  "=r, w,r,w,r,m,m,w,m")
+"nonimmediate_operand"  

Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Siddhesh Poyarekar
On Thursday 18 January 2018 01:11 AM, Jim Wilson wrote:
> This is the only solution I found that worked.

I tried a few things and ended up with pretty much the same fix you have,
except with the check in a slightly different place.  That is, I used
aarch64_classify_address to gate the change, because I intend to use it to
gate pre/post-increments for stores on falkor as well.

I also fixed up my submission to incorporate your more recent changes,
namely adding an extra tuning option instead of just checking for falkor.

Siddhesh

2018-xx-xx  Jim Wilson  
Kugan Vivenakandarajah  
Siddhesh Poyarekar  

gcc/
* gcc/config/aarch64/aarch64-protos.h (aarch64_addr_query_type):
New member ADDR_QUERY_STR.
* gcc/config/aarch64/aarch64-tuning-flags.def
(SLOW_REGOFFSET_QUADWORD_STORE): New.
* gcc/config/aarch64/aarch64.c (qdf24xx_tunings): Add
SLOW_REGOFFSET_QUADWORD_STORE to tuning flags.
(aarch64_classify_address): Avoid register indexing for quad
mode stores when SLOW_REGOFFSET_QUADWORD_STORE is set.
* gcc/config/aarch64/constraints.md (Uts): New constraint.
* gcc/config/aarch64/aarch64.md (movti_aarch64, movtf_aarch64):
Use it.
* gcc/config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>):
Likewise.

gcc/testsuite/
* gcc/testsuite/gcc.target/aarch64/pr82533.c: New test case.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 159bc6aee7e..15924fc3f58 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -120,6 +120,9 @@ enum aarch64_symbol_type
ADDR_QUERY_LDP_STP
   Query what is valid for a load/store pair.
 
+   ADDR_QUERY_STR
+  Query what is valid for a store.
+
ADDR_QUERY_ANY
   Query what is valid for at least one memory constraint, which may
   allow things that "m" doesn't.  For example, the SVE LDR and STR
@@ -128,6 +131,7 @@ enum aarch64_symbol_type
 enum aarch64_addr_query_type {
   ADDR_QUERY_M,
   ADDR_QUERY_LDP_STP,
+  ADDR_QUERY_STR,
   ADDR_QUERY_ANY
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a01cb7..48d92702723 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -131,9 +131,9 @@
 
 (define_insn "*aarch64_simd_mov"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-   "=w, Umq,  m,  w, ?r, ?w, ?r, w")
+   "=w, Umq, Uts,  w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
-   "m,  Dz, w,  w,  w,  r,  r, Dn"))]
+   "m,  Dz,w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
&& (register_operand (operands[0], mode)
|| aarch64_simd_reg_or_zero (operands[1], mode))"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index ea9ead234cb..04baf5b6de6 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
+/* Don't use a register offset in a memory address for a quad-word store.  */
+AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store",
+                             SLOW_REGOFFSET_QUADWORD_STORE)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0599a79bfeb..d17f3b70271 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -894,7 +894,7 @@ static const struct tune_params qdf24xx_tunings =
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE),  /* tune_flags.  */
  &qdf24xx_prefetch_tune
 };
 
@@ -5531,6 +5531,16 @@ aarch64_classify_address (struct aarch64_address_info *info,
|| vec_flags == VEC_ADVSIMD
|| vec_flags == VEC_SVE_DATA));
 
+  /* Avoid register indexing for 128-bit stores when the
+     AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE option is set.  */
+  if (!optimize_size
+      && type == ADDR_QUERY_STR
+      && (aarch64_tune_params.extra_tuning_flags
+          & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE)
+      && (mode == TImode || mode == TFmode
+          || aarch64_vector_data_mode_p (mode)))
+    allow_reg_index_p = false;
+
   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
  [Rn, #offset, MUL VL].  */
   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
diff --git a/gcc/config/aarch64/aarch64.md 

Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Jim Wilson

On 01/17/2018 05:37 AM, Wilco Dijkstra wrote:

In general I think the best way to achieve this would be to use the
existing cost models which are there for exactly this purpose. If
this doesn't work well enough then we should fix those. 


I tried using cost models, and this didn't work, because the costs don't 
allow us to distinguish between loads and stores.  If you mark reg+reg 
as expensive, you get a performance loss from losing the loads, and a 
performance gain from losing the stores, and they cancel each other out.
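For context, here is the shape of the table involved (struct
cpu_addrcost_table from aarch64-protos.h): there is a single
register_offset cost, applied to every reg+reg address whether it is a
load or a store.  The values below are purely illustrative, not falkor's
actual tuning:

  static const struct cpu_addrcost_table example_addrcost_table =
  {
      {
        0, /* hi  */
        0, /* si  */
        0, /* di  */
        1, /* ti  */
      },
    0, /* pre_modify  */
    0, /* post_modify  */
    2, /* register_offset: shared by loads and stores.  */
    2, /* register_sextend  */
    2, /* register_zextend  */
    0, /* imm_offset  */
  };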



this patch disables a whole class of instructions for a specific
target rather than simply telling GCC that they are expensive and
should only be used if there is no cheaper alternative.


This is the only solution I found that worked.


Also there is potential impact on generic code from:

  (define_insn "*aarch64_simd_mov"
[(set (match_operand:VQ 0 "nonimmediate_operand"
-   "=w, Umq,  m,  w, ?r, ?w, ?r, w")
+   "=w, Umq, Utf,  w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
-   "m,  Dz, w,  w,  w,  r,  r, Dn"))]
+   "m,  Dz,w,  w,  w,  r,  r, Dn"))]

It seems an 'm' constraint has special meaning in the register allocator,
using a different constraint can block certain simplifications (for example
merging stack offsets into load/store in the post-reload cleanup pass),
so we'd need to verify this doesn't cause regressions.


No optimizer should be checking for 'm'.  They should be checking for 
CT_MEMORY, which indicates a constraint that accepts memory.  Utf is 
properly marked as a memory constraint.
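Roughly, the target-independent check looks like this, so a properly
registered memory constraint is treated just like 'm' (a sketch of the
existing interface, not new code):

  /* How generic code classifies a constraint string; Utf, being a
     define_memory_constraint, is classified as CT_MEMORY like 'm'.  */
  enum constraint_num cn = lookup_constraint (constraint);
  if (get_constraint_type (cn) == CT_MEMORY)
    {
      /* The alternative accepts memory.  */
    }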


I did some testing to verify that the patch would not affect other 
aarch64 targets at the time, though I don't recall now exactly what I did.


Jim




Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Siddhesh Poyarekar
On Wednesday 17 January 2018 11:13 PM, Wilco Dijkstra wrote:
> Are you saying the same issue exists for all stores with writeback? If so then
> your patch would need to address that too.

Yes, I'll be posting a separate patch for that because the condition set
is slightly different for it.  It will also be accompanied with a
slightly different tuning for addrcost, which is why it needs separate
testing.

> It seems way more fundamental if it affects anything that isn't a simple
> immediate offset. Again I suggest using the existing cost infrastructure
> to find a setting that improves performance. If discouraging pre/post
> increment helps Falkor then that's better than doing nothing.

The existing costs don't differentiate between loads and stores and that
is specifically what I need for falkor.

>>> I think a special case for Falkor in aarch64_address_cost would be
>>> acceptable in GCC8 - that would be much smaller and cleaner than the
>>> current patch.  If required we could improve upon this in GCC9 and
>>> add a way to differentiate between loads and stores.
>>
>> I can't do this in address_cost since I can't determine whether the
>> address is a load or a store location.  The most minimal way seems to be
>> using the patterns in the md file.
> 
> Well I don't think the approach of blocking specific patterns is a good
> solution to this problem and may not be accepted by AArch64 maintainers.
> Try your example with -funroll-loops and compare with my suggestion
> (with or without extra code to increase the cost of writeback too). It
> seems to me adjusting costs is always going to result in better overall
> code quality, even if it also applies to loads for the time being.

Costs are not useful for this scenario because they cannot differentiate
between loads and stores.  To make that distinction I have to block
specific patterns, unless there's a better way I'm unaware of that helps
determine whether a memory reference is a load or a store.

Another approach I am trying, which minimizes the change, is to add a new
ADDR_QUERY_STR query type for aarch64_legitimate_address_p, which can then
be used in aarch64_classify_address to skip the register addressing mode
for falkor.  That way we avoid the additional hook.  It will still need
the additional Utf memory constraint though.
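(The reason md patterns can make the distinction at all is that a store is
simply a SET whose destination is a MEM, which constraints on operand 0
can see; the address-cost hook only ever sees the address rtx itself.)  A
sketch of that test using the standard rtl accessors:

  /* Sketch: the load/store distinction that is visible at the pattern
     level but not in the address-cost hook.  */
  static bool
  insn_is_store_p (rtx_insn *insn)
  {
    rtx set = single_set (insn);
    return set != NULL_RTX && MEM_P (SET_DEST (set));
  }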

Do you know of a way I can distinguish between loads and stores in costs
tuning?

Siddhesh


Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Wilco Dijkstra
Siddhesh Poyarekar wrote: 
On Wednesday 17 January 2018 08:31 PM, Wilco Dijkstra wrote:
> Why is that a bad thing? With the patch as is, the testcase generates:
> 
> .L4:
>    ldr q0, [x2, x3]
>    add x5, x1, x3
>    add x3, x3, 16
>    cmp x3, x4
>    str q0, [x5]
>    bne .L4
> 
> With a change in address cost (for loads and stores) we would get:
> 
> .L4:
>    ldr q0, [x3], 16
>    str q0, [x4], 16
>    cmp x3, x5
>    bne .L4

> This is great for the load because of the way the falkor prefetcher
> works, but it is terrible for the store because of the way the pipeline
> works.  The only performant store for falkor is an indirect store with a
> constant or zero offset.  Everything else has hidden costs.

Are you saying the same issue exists for all stores with writeback? If so then
your patch would need to address that too.

> I briefly looked at the possibility of splitting the register_offset
> cost into load and store, but I realized that I'd have to modify the
> target hook for it to be useful, which is way too much work for this
> single quirk.

It seems way more fundamental if it affects anything that isn't a simple
immediate offset. Again I suggest using the existing cost infrastructure
to find a setting that improves performance. If discouraging pre/post increment
helps Falkor then that's better than doing nothing.

>> I think a special case for Falkor in aarch64_address_cost would be acceptable
>> in GCC8 - that would be much smaller and cleaner than the current patch. 
>> If required we could improve upon this in GCC9 and add a way to differentiate
>> between loads and stores.
>
> I can't do this in address_cost since I can't determine whether the
> address is a load or a store location.  The most minimal way seems to be
> using the patterns in the md file.

Well I don't think the approach of blocking specific patterns is a good
solution to this problem and may not be accepted by AArch64 maintainers.
Try your example with -funroll-loops and compare with my suggestion (with
or without extra code to increase the cost of writeback too). It seems to
me adjusting costs is always going to result in better overall code
quality, even if it also applies to loads for the time being.

Wilco

Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Siddhesh Poyarekar
On Wednesday 17 January 2018 08:31 PM, Wilco Dijkstra wrote:
> Why is that a bad thing? With the patch as is, the testcase generates:
> 
> .L4:
>   ldr q0, [x2, x3]
>   add x5, x1, x3
>   add x3, x3, 16
>   cmp x3, x4
>   str q0, [x5]
>   bne .L4
> 
> With a change in address cost (for loads and stores) we would get:
> 
> .L4:
>   ldr q0, [x3], 16
>   str q0, [x4], 16
>   cmp x3, x5
>   bne .L4
> 
> This looks better to me, especially if there are more loads and stores and
> some have offsets as well (the writeback is once per stream while the extra
> add happens for every store). It may be worth trying both possibilities
> on a large body of code and see which comes out smallest/fastest.

This is great for the load because of the way the falkor prefetcher
works, but it is terrible for the store because of the way the pipeline
works.  The only performant store for falkor is an indirect store with a
constant or zero offset.  Everything else has hidden costs.

> Note using the cost model as intended means the compiler tries to use the
> lowest cost possibility rather than never emitting the instruction, not even
> when optimizing for size. I think it's wrong to always block a valid 
> instruction.

> It's not clear whether it is easy to split out the costs today (it
> could be done in aarch64_rtx_costs but not aarch64_address_cost, and
> the latter is what IVOpt uses).

I briefly looked at the possibility of splitting the register_offset
cost into load and store, but I realized that I'd have to modify the
target hook for it to be useful, which is way too much work for this
single quirk.

>> Further, it seems like worthwhile work only if there are other parts
>> that actually have the same quirk and can use this split.  Do you know
>> of any such cores?
> 
> Currently there are several supported CPUs which use a much higher cost
> for TImode and for register offsets. So it's a common thing to want, however
> I don't know whether splitting load/store address costs helps for those.

It wouldn't.  That ought to be expressible already using the addr_scale_costs.

> I think a special case for Falkor in aarch64_address_cost would be acceptable
> in GCC8 - that would be much smaller and cleaner than the current patch. 
> If required we could improve upon this in GCC9 and add a way to differentiate
> between loads and stores.

I can't do this in address_cost since I can't determine whether the
address is a load or a store location.  The most minimal way seems to be
using the patterns in the md file.

Siddhesh


Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Wilco Dijkstra
Siddhesh Poyarekar wrote:
  
> The current cost model will disable reg offset for loads as well as
> stores, which doesn't work well since loads with reg offset are faster
> for falkor.

Why is that a bad thing? With the patch as is, the testcase generates:

.L4:
ldr q0, [x2, x3]
add x5, x1, x3
add x3, x3, 16
cmp x3, x4
str q0, [x5]
bne .L4

With a change in address cost (for loads and stores) we would get:

.L4:
ldr q0, [x3], 16
str q0, [x4], 16
cmp x3, x5
bne .L4

This looks better to me, especially if there are more loads and stores and
some have offsets as well (the writeback is once per stream while the extra
add happens for every store). It may be worth trying both possibilities
on a large body of code and see which comes out smallest/fastest.

Note using the cost model as intended means the compiler tries to use the
lowest cost possibility rather than never emitting the instruction, not even
when optimizing for size. I think it's wrong to always block a valid 
instruction.

> Also, this is a very specific tweak for a specific processor, i.e. I
> don't know if there is value in splitting out the costs into loads and
> stores, and further into 128-bit and smaller, just to set the 128-bit
> store cost higher.  That would increase the size of the change by quite
> a bit and may make it unsuitable for inclusion into gcc8 at this stage,
> while the current one still qualifies given its contained impact.

It's not clear whether it is easy to split out the costs today (it could be done
in aarch64_rtx_costs but not aarch64_address_cost, and the latter is what
IVOpt uses).

> Further, it seems like worthwhile work only if there are other parts
> that actually have the same quirk and can use this split.  Do you know
> of any such cores?

Currently there are several supported CPUs which use a much higher cost
for TImode and for register offsets. So it's a common thing to want, however
I don't know whether splitting load/store address costs helps for those.

I think a special case for Falkor in aarch64_address_cost would be acceptable
in GCC8 - that would be much smaller and cleaner than the current patch. 
If required we could improve upon this in GCC9 and add a way to differentiate
between loads and stores.

Wilco

Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Siddhesh Poyarekar
On Wednesday 17 January 2018 07:07 PM, Wilco Dijkstra wrote:
> (finished version this time, somehow Outlook loves to send emails early...)
> 
> Hi,
> 
> In general I think the best way to achieve this would be to use the
> existing cost models which are there for exactly this purpose. If
> this doesn't work well enough then we should fix those. As is,
> this patch disables a whole class of instructions for a specific
> target rather than simply telling GCC that they are expensive and
> should only be used if there is no cheaper alternative.

The current cost model will disable reg offset for loads as well as
stores, which doesn't work well since loads with reg offset are faster
for falkor.

Also, this is a very specific tweak for a specific processor, i.e. I
don't know if there is value in splitting out the costs into loads and
stores, and further into 128-bit and smaller, just to set the 128-bit
store cost higher.  That would increase the size of the change by quite a
bit and may make it unsuitable for inclusion into gcc8 at this stage,
while the current one still qualifies given its contained impact.

Further, it seems like worthwhile work only if there are other parts
that actually have the same quirk and can use this split.  Do you know
of any such cores?

> Also there is potential impact on generic code from:
> 
>  (define_insn "*aarch64_simd_mov<mode>"
>    [(set (match_operand:VQ 0 "nonimmediate_operand"
> - "=w, Umq,  m,  w, ?r, ?w, ?r, w")
> + "=w, Umq, Utf,  w, ?r, ?w, ?r, w")
> 	(match_operand:VQ 1 "general_operand"
> - "m,  Dz, w,  w,  w,  r,  r, Dn"))]
> + "m,  Dz,  w,  w,  w,  r,  r, Dn"))]
> 
> It seems an 'm' constraint has special meaning in the register allocator,
> using a different constraint can block certain simplifications (for example
> merging stack offsets into load/store in the post-reload cleanup pass),
> so we'd need to verify this doesn't cause regressions.

I'll verify this.

> Also it is best to introduce generic interfaces:
> 
> +/* Return TRUE if OP is a good address mode for movti target on falkor.  */
> +bool
> +aarch64_falkor_movti_target_operand_p (rtx op)
> 
> +(define_memory_constraint "Utf"
> +  "@iternal
> +   A good address for a falkor movti target operand."
> +  (and (match_code "mem")
> +   (match_test "aarch64_falkor_movti_target_operand_p (op)")))
> 
> We should use generic names here even if the current implementation
> wants to do something specific for Falkor.

I'll fix this.

Thanks,
Siddhesh


Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Wilco Dijkstra
(finished version this time, somehow Outlook loves to send emails early...)

Hi,

In general I think the best way to achieve this would be to use the
existing cost models which are there for exactly this purpose. If
this doesn't work well enough then we should fix those. As is,
this patch disables a whole class of instructions for a specific
target rather than simply telling GCC that they are expensive and
should only be used if there is no cheaper alternative.

Also there is potential impact on generic code from:

 (define_insn "*aarch64_simd_mov"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-   "=w, Umq,  m,  w, ?r, ?w, ?r, w")
+   "=w, Umq, Utf,  w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
-   "m,  Dz, w,  w,  w,  r,  r, Dn"))]
+   "m,  Dz,w,  w,  w,  r,  r, Dn"))]

It seems an 'm' constraint has special meaning in the register allocator,
using a different constraint can block certain simplifications (for example
merging stack offsets into load/store in the post-reload cleanup pass),
so we'd need to verify this doesn't cause regressions.

Also it is best to introduce generic interfaces:

+/* Return TRUE if OP is a good address mode for movti target on falkor.  */
+bool
+aarch64_falkor_movti_target_operand_p (rtx op)

+(define_memory_constraint "Utf"
+  "@iternal
+   A good address for a falkor movti target operand."
+  (and (match_code "mem")
+   (match_test "aarch64_falkor_movti_target_operand_p (op)")))

We should use generic names here even if the current implementation
wants to do something specific for Falkor.

Wilco

Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Wilco Dijkstra
Hi,

In general I think the best way to achieve this would be to use the
existing cost models which are there for exactly this purpose. If
this doesn't work well enough then we should fix those. As is,
this patch disables a whole class of instructions for a specific
target rather than simply telling GCC that they are expensive and
should only be used if there is no cheaper alternative.

Also there is impact on generic code from:



[PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor

2018-01-17 Thread Siddhesh Poyarekar
From: Siddhesh Poyarekar 

Hi,

Jim Wilson posted a patch for this in September[1], and from the discussion
that followed it appears the patch was an acceptable fix for falkor.  Kugan
followed up[2] with a test case, since one was requested during the initial
review.  Jim has moved on from Linaro, so I'm pinging this patch in the hope
that it is OK for inclusion, since it was posted before the freeze and its
impact is isolated to falkor.

Siddhesh

[1] https://gcc.gnu.org/ml/gcc-patches/2017-09/msg01547.html
[2] https://gcc.gnu.org/ml/gcc-patches/2017-11/msg00050.html

On Falkor, because of an idiosyncrasy of how the pipelines are designed, a
quad-word store using a reg+reg addressing mode is almost twice as slow as an
add followed by a quad-word store with a single-reg addressing mode.  So we
get better performance if we disallow addressing modes using register offsets
with quad-word stores.
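The new test referenced below (gcc.target/aarch64/pr82533.c) exercises
exactly this pattern.  Here is a sketch of the shape such a test takes;
the dg directives and the scan comment are my assumptions, not
necessarily the committed file:

  /* { dg-do compile } */
  /* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */

  void
  copy (int N, double *c, double *a)
  {
    for (int i = 0; i < N; i++)
      c[i] = a[i];
  }

  /* The vectorized copy loop must not store through a reg+reg address,
     i.e. no "str qN, [xM, xK]" should appear in the assembly.  */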

Using lmbench compiled with -O2 -ftree-vectorize as my benchmark, I see a 13%
performance increase on stream copy using this patch, and a 16% performance
increase on stream scale using this patch.  I also see a small performance
increase on SPEC CPU2006 of around 0.2% for int and 0.4% for FP at -O3.

2018-01-17  Jim Wilson  
Kugan Vivekanandarajah  

gcc/
* config/aarch64/aarch64-protos.h
(aarch64_falkor_movti_target_operand_p): Declare.
* config/aarch64/aarch64.c (aarch64_falkor_movti_target_operand_p): New.
* config/aarch64/constraints.md (Utf): New.
* config/aarch64/aarch64.md (movti_aarch64): Use Utf constraint instead
of m.
(movtf_aarch64): Likewise.
* config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf
constraint instead of m.

gcc/testsuite/
* gcc/testsuite/gcc.target/aarch64/pr82533.c: New test case.

---
 gcc/config/aarch64/aarch64-protos.h        |  1 +
 gcc/config/aarch64/aarch64-simd.md         |  4 ++--
 gcc/config/aarch64/aarch64.c               | 10 ++++++++++
 gcc/config/aarch64/aarch64.md              |  6 +++---
 gcc/config/aarch64/constraints.md          |  6 ++++++
 gcc/testsuite/gcc.target/aarch64/pr82533.c | 11 +++++++++++
 6 files changed, 33 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr82533.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2d705d2..088d864 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -433,6 +433,7 @@ bool aarch64_simd_mem_operand_p (rtx);
 bool aarch64_sve_ld1r_operand_p (rtx);
 bool aarch64_sve_ldr_operand_p (rtx);
 bool aarch64_sve_struct_memory_operand_p (rtx);
+bool aarch64_falkor_movti_target_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a0..f7daac3 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -131,9 +131,9 @@
 
 (define_insn "*aarch64_simd_mov"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-   "=w, Umq,  m,  w, ?r, ?w, ?r, w")
+   "=w, Umq, Utf,  w, ?r, ?w, ?r, w")
(match_operand:VQ 1 "general_operand"
-   "m,  Dz, w,  w,  w,  r,  r, Dn"))]
+   "m,  Dz,w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
&& (register_operand (operands[0], mode)
|| aarch64_simd_reg_or_zero (operands[1], mode))"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e70f3a..0db7a4f 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13477,6 +13477,16 @@ aarch64_sve_struct_memory_operand_p (rtx op)
  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
 }
 
+/* Return TRUE if OP is a good address mode for movti target on falkor.  */
+bool
+aarch64_falkor_movti_target_operand_p (rtx op)
+{
+  if ((enum attr_tune) aarch64_tune == TUNE_FALKOR)
+    return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS
+                            && ! CONST_INT_P (XEXP (XEXP (op, 0), 1)));
+  return MEM_P (op);
+}
+
 /* Emit a register copy from operand to operand, taking care not to
early-clobber source registers in the process.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index edb6a75..696fd12 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1079,7 +1079,7 @@
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-"nonimmediate_operand"  "=r, w,r,w,r,m,m,w,m")
+"nonimmediate_operand"  "=r, w,r,w,r,m,m,w,Utf")
(match_operand:TI 1
 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))]
   "(register_operand (operands[0], TImode)
@@ -1226,9 +1226,9 @@
 
 (define_insn "*movtf_aarch64"
   [(set