[PATCH, rs6000] Do not enable pcrel-opt by default

2021-06-22 Thread Aaron Sawdey via Gcc-patches
SPEC2017 testing on p10 shows that this optimization does not have a
positive impact on performance, so we are no longer going to enable it
by default. The test cases for it are updated to enable it explicitly
so they still exercise the code.
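
Concretely, each affected test keeps its power10 cpu setting and now
requests the optimization explicitly:

    /* { dg-options "-O2 -mdejagnu-cpu=power10 -mpcrel-opt" } */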

OK for trunk and backport to 11 if bootstrap/regtest passes?

Thanks!
   Aaron

gcc/

* config/rs6000/rs6000-cpus.def: Take OPTION_MASK_PCREL_OPT out
 of OTHER_POWER10_MASKS so it will not be enabled by default.

gcc/testsuite/

* gcc.target/powerpc/pcrel-opt-inc-di.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-ld-df.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-ld-di.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-ld-hi.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-ld-qi.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-ld-sf.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-ld-si.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-ld-vector.c: Enable -mpcrel-opt to
test it.
* gcc.target/powerpc/pcrel-opt-st-df.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-st-di.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-st-hi.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-st-qi.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-st-sf.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-st-si.c: Enable -mpcrel-opt to test it.
* gcc.target/powerpc/pcrel-opt-st-vector.c: Enable -mpcrel-opt to
test it.
---
 gcc/config/rs6000/rs6000-cpus.def  | 3 ++-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-inc-di.c| 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-df.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-di.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-hi.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-qi.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-sf.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-si.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-vector.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-st-df.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-st-di.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-st-hi.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-st-qi.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-st-sf.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-st-si.c | 2 +-
 gcc/testsuite/gcc.target/powerpc/pcrel-opt-st-vector.c | 2 +-
 16 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-cpus.def 
b/gcc/config/rs6000/rs6000-cpus.def
index 52ce84835f7..1e8c9a68c3f 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -75,9 +75,10 @@
 | OPTION_MASK_P9_VECTOR)
 
 /* Flags that need to be turned off if -mno-power10.  */
+/* PCREL_OPT is now disabled by default so we comment it out here.  */
 #define OTHER_POWER10_MASKS(OPTION_MASK_MMA\
 | OPTION_MASK_PCREL\
-| OPTION_MASK_PCREL_OPT\
+/* | OPTION_MASK_PCREL_OPT */  \
 | OPTION_MASK_PREFIXED)
 
 #define ISA_3_1_MASKS_SERVER   (ISA_3_0_MASKS_SERVER   \
diff --git a/gcc/testsuite/gcc.target/powerpc/pcrel-opt-inc-di.c 
b/gcc/testsuite/gcc.target/powerpc/pcrel-opt-inc-di.c
index c82041c9dc6..6272f5c72c3 100644
--- a/gcc/testsuite/gcc.target/powerpc/pcrel-opt-inc-di.c
+++ b/gcc/testsuite/gcc.target/powerpc/pcrel-opt-inc-di.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target powerpc_pcrel } */
-/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
+/* { dg-options "-O2 -mdejagnu-cpu=power10 -mpcrel-opt" } */
 
 #define TYPE   unsigned int
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-df.c 
b/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-df.c
index d35862fcb6e..0dcab311add 100644
--- a/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-df.c
+++ b/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-df.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target powerpc_pcrel } */
-/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
+/* { dg-options "-O2 -mdejagnu-cpu=power10 -mpcrel-opt" } */
 
 #define TYPE   double
 #define LARGE  0x2
diff --git a/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-di.c 
b/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-di.c
index 7e1ff99f20e..95b60f3b151 100644
--- a/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-di.c
+++ b/gcc/testsuite/gcc.target/powerpc/pcrel-opt-ld-di.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target powerpc_pcrel } */

[PATCH,rs6000] Fix p10 fusion regtests

2021-06-18 Thread Aaron Sawdey via Gcc-patches
From: Aaron Sawdey 

Update the count of matches for the fusion combine patterns after
the recent changes to them.  At Segher's request, I used \m and \M
in the match patterns. I have also grouped together all alternatives of
each fusion insn, which should make these tests a little less
fragile.
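
With \m and \M (Tcl word boundaries) and the grouped alternatives, each
directive now has roughly this shape (the count here is illustrative,
not one of the committed values):

    /* { dg-final { scan-assembler-times {\mfuse_and_and\M} 32 { target lp64 } } } */

so "fuse_and_and" no longer also matches longer names such as
"fuse_and_andc".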

OK for trunk and backport to 11?

Thanks!
   Aaron

gcc/testsuite/ChangeLog

* gcc.target/powerpc/fusion-p10-2logical.c: Update pattern
match counts.
* gcc.target/powerpc/fusion-p10-addadd.c: Update pattern match
counts.
* gcc.target/powerpc/fusion-p10-ldcmpi.c: Update pattern match
counts.
* gcc.target/powerpc/fusion-p10-logadd.c: Update pattern match
counts.
---
 .../gcc.target/powerpc/fusion-p10-2logical.c   | 401 -
 .../gcc.target/powerpc/fusion-p10-addadd.c |  17 +-
 .../gcc.target/powerpc/fusion-p10-ldcmpi.c |  10 +-
 .../gcc.target/powerpc/fusion-p10-logadd.c | 129 ++-
 4 files changed, 194 insertions(+), 363 deletions(-)

diff --git a/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c 
b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
index de22176..009a5f2 100644
--- a/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
+++ b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
@@ -64,262 +64,151 @@ TEST(vboolchar_t);
 TEST(vuint_t);
 
 /* Recreate with:
-   grep ' \*fuse_' fusion-p10-2logical.s|sed -e 's,^.*\*,,' |sort -k 7,7 |uniq -c|awk '{l=30-length($2); printf("/%s* { %s { scan-assembler-times \"%s\"%-*s    %4d { target lp64 } } } *%s/\n","","dg-final",$2,l,"",$1,"");}'
+   grep ' \*fuse_' fusion-p10-2logical.s|sed -e 's,^.*\*,,' -e 's,/[0-9],/,' |sort -k 7,7 |uniq -c|awk '{l=30-length($2); printf("/%s* { %s { scan-assembler-times {\\m%s\\M}%-*s%4d { target lp64 } } } *%s/\n","","dg-final",$2,l,"",$1,"");}'
  */
-  
-/* { dg-final { scan-assembler-times "fuse_and_and/1"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_and/2"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_and/0"   16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_and/1"   26 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_and/2"   48 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_and/3"    6 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_or/0"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_or/1"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_or/2"    32 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_orc/0"    8 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_orc/1"    8 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_orc/2"   48 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_xor/0"   16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_xor/1"   16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_andc_xor/2"   32 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_eqv/0"     8 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_eqv/2"    24 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_or/0"     16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_or/2"     16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_orc/0"     8 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_orc/2"    24 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_xor/0"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_and_xor/2"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_eqv_and/0"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_eqv_and/2"    16 { target lp64 } } } */
-/* { dg-final { scan-assembler-times "fuse_eqv_

[PATCH] Add needed earlyclobber to fusion patterns

2021-06-16 Thread Aaron Sawdey via Gcc-patches
The add-logical and add-add fusion patterns all have constraint
alternatives "=0,1,&r,r" for the output (3). The inputs 0 and 1
are used in the first fusion instruction, and then either may be
reused as a temp to hold the output of the first insn, which is an
input to the second. However, if input 2 is the same as 0 or 1,
it gets clobbered unexpectedly. So the first 2 alts need to be
"=&0,&1,&r,r" instead, to indicate that in alts 0 and 1 the
register used for 3 is earlyclobber, hence can't be the same as
input 2.
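
As a sketch of the failure mode (illustrative C, not code from the
patch), take an add-logical pair emitted as "or t,a,b ; subf r,t,c":

    /* r = c - (a | b).  If r is tied to input 0 or 1 (alts 0/1) and
       input 2 (c) was assigned that same register, the "or" overwrites
       c before the subf reads it.  The added "&" tells reload the
       output is written early, so it cannot share input 2's register.  */
    long add_logical (long a, long b, long c)
    {
      return c - (a | b);
    }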

This was actually encountered in the backport of the add-logical
fusion patch to gcc-11. Some code in go hit this case:

   :andc r30,r30,r9
r30 now (~(x|((x)+c)))&(~c) --> this is new x
   :b 
   :addi r31,r31,-1
r31 now m-1
   :srd r31,r30,r31
r31 now x>>(m-1)
   :subf r30,r31,r30
r30 now x-(x>>(m-1))
   :or r30,r30,r30   # mdoom
nop
   :not r3,r30
r3 now ~(x-(x>>(m-1))) -- WHOOPS

The or r30,r30,r30 was meant to be or-ing in the earlier value
of r30 which was overwritten by the output of the subf.

OK for trunk and backport to 11 if bootstrap and regtest pass?

Separately I will be updating the fusion regtests because this
change has again shifted which pattern alternatives get used 
and how many times.

Thanks!
Aaron

gcc/ChangeLog

* config/rs6000/genfusion.pl (gen_logical_addsubf): Add
earlyclobber to alts 0/1.
(gen_addadd): Add earlyclobber to alts 0/1.
* config/rs6000/fusion.md: Regenerate file.
---
 gcc/config/rs6000/fusion.md| 300 -
 gcc/config/rs6000/genfusion.pl |   4 +-
 2 files changed, 152 insertions(+), 152 deletions(-)

diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
index e642ff5f95f..516baa0bb0b 100644
--- a/gcc/config/rs6000/fusion.md
+++ b/gcc/config/rs6000/fusion.md
@@ -358,7 +358,7 @@ (define_insn_and_split "*lbz_cmpldi_cr0_QI_GPR_CCUNS_zero"
 ;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar and -> and
 (define_insn "*fuse_and_and"
-  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
+  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r")
 (and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")
   (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r"))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
@@ -376,7 +376,7 @@ (define_insn "*fuse_and_and"
 ;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar andc -> and
 (define_insn "*fuse_andc_and"
-  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
+  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r")
 (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r"))
   (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
@@ -394,7 +394,7 @@ (define_insn "*fuse_andc_and"
 ;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar eqv -> and
 (define_insn "*fuse_eqv_and"
-  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
+  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r")
 (and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r")
   (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
@@ -412,7 +412,7 @@ (define_insn "*fuse_eqv_and"
 ;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar nand -> and
 (define_insn "*fuse_nand_and"
-  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
+  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r")
 (and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r"))
   (not:GPR (match_operand:GPR 1 "gpc_reg_operand" 
"r,r,r,r")))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
@@ -430,7 +430,7 @@ (define_insn "*fuse_nand_and"
 ;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar nor -> and
 (define_insn "*fuse_nor_and"
-  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
+  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r")
 (and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r"))
   (not:GPR (match_operand:GPR 1 "gpc_reg_operand" 
"r,r,r,r")))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
@@ -448,7 +448,7 @@ (define_insn "*fuse_nor_and"
 ;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar or -> and
 (define_insn "*fuse_or_and"
-  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
+  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&0,&1,&r,r")
 (and:GPR (ior:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")
   (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))
  

[PATCH,rs6000] Do not check if SMS succeeds on powerpc

2021-06-11 Thread Aaron Sawdey via Gcc-patches
These tests have become unstable and SMS either succeeds or doesn't
depending on things like changes in instruction latency. Removing
the scan-rtl-dump-times checks for powerpc*-*-*.

If bootstrap/regtest passes, ok for trunk and backport to 11?

Thanks!
   Aaron

gcc/testsuite

* gcc.dg/sms-1.c: Remove scan-rtl-dump-times check.
* gcc.dg/sms-2.c: Remove scan-rtl-dump-times check.
* gcc.dg/sms-3.c: Remove scan-rtl-dump-times check.
* gcc.dg/sms-4.c: Remove scan-rtl-dump-times check.
* gcc.dg/sms-6.c: Remove scan-rtl-dump-times check.
* gcc.dg/sms-8.c: Remove scan-rtl-dump-times check.
* gcc.dg/sms-10.c: Remove scan-rtl-dump-times check.
---
 gcc/testsuite/gcc.dg/sms-1.c  | 2 --
 gcc/testsuite/gcc.dg/sms-10.c | 3 ---
 gcc/testsuite/gcc.dg/sms-2.c  | 2 --
 gcc/testsuite/gcc.dg/sms-3.c  | 3 ---
 gcc/testsuite/gcc.dg/sms-4.c  | 3 ---
 gcc/testsuite/gcc.dg/sms-6.c  | 2 --
 gcc/testsuite/gcc.dg/sms-8.c  | 4 
 7 files changed, 19 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/sms-1.c b/gcc/testsuite/gcc.dg/sms-1.c
index 26868c34c71..098e1aa6e45 100644
--- a/gcc/testsuite/gcc.dg/sms-1.c
+++ b/gcc/testsuite/gcc.dg/sms-1.c
@@ -40,5 +40,3 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-rtl-dump-times "SMS succeeded" 1 "sms"  { target 
powerpc*-*-* } } } */
-
diff --git a/gcc/testsuite/gcc.dg/sms-10.c b/gcc/testsuite/gcc.dg/sms-10.c
index d85e8e2a274..df3bba24ed0 100644
--- a/gcc/testsuite/gcc.dg/sms-10.c
+++ b/gcc/testsuite/gcc.dg/sms-10.c
@@ -113,6 +113,3 @@ main ()
 
   return 0;
 }
-
-/* { dg-final { scan-rtl-dump-times "SMS succeeded" 1 "sms" { target 
powerpc*-*-* } } } */
-
diff --git a/gcc/testsuite/gcc.dg/sms-2.c b/gcc/testsuite/gcc.dg/sms-2.c
index 7b96f550262..f8375f9f05d 100644
--- a/gcc/testsuite/gcc.dg/sms-2.c
+++ b/gcc/testsuite/gcc.dg/sms-2.c
@@ -31,5 +31,3 @@ fun (nb)
  sy = 0;
   }
 }
-
-/* { dg-final { scan-rtl-dump-times "SMS loop many exits" 1 "sms" { target 
powerpc*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/sms-3.c b/gcc/testsuite/gcc.dg/sms-3.c
index 822b516af2f..5e56ecf761c 100644
--- a/gcc/testsuite/gcc.dg/sms-3.c
+++ b/gcc/testsuite/gcc.dg/sms-3.c
@@ -38,6 +38,3 @@ main ()
   foo (6, 3);
   return 0;
 }
-
-/* { dg-final { scan-rtl-dump-times "SMS succeeded" 1 "sms" { target 
powerpc*-*-* } } } */
-
diff --git a/gcc/testsuite/gcc.dg/sms-4.c b/gcc/testsuite/gcc.dg/sms-4.c
index f5ebb55a2f4..8416b8b9ce9 100644
--- a/gcc/testsuite/gcc.dg/sms-4.c
+++ b/gcc/testsuite/gcc.dg/sms-4.c
@@ -34,6 +34,3 @@ main ()
   foo (5, a, b, c, dst);
   return 0;
 }
-
-/* { dg-final { scan-rtl-dump-times "SMS succeeded" 1 "sms" { target 
powerpc*-*-* } } } */
-
diff --git a/gcc/testsuite/gcc.dg/sms-6.c b/gcc/testsuite/gcc.dg/sms-6.c
index e57e01539eb..d6fa45a2cf9 100644
--- a/gcc/testsuite/gcc.dg/sms-6.c
+++ b/gcc/testsuite/gcc.dg/sms-6.c
@@ -41,5 +41,3 @@ int main()
   
   return 0;
 }
-
-/* { dg-final { scan-rtl-dump-times "SMS succeeded" 3 "sms" { target 
powerpc*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/sms-8.c b/gcc/testsuite/gcc.dg/sms-8.c
index 7ccaa454125..dc0a3fc1f9b 100644
--- a/gcc/testsuite/gcc.dg/sms-8.c
+++ b/gcc/testsuite/gcc.dg/sms-8.c
@@ -34,7 +34,3 @@ main ()
   res = foo (3, 4);
   return 0;
 }
-
-/* { dg-final { scan-rtl-dump-times "SMS succeeded" 1 "sms" { target 
powerpc*-*-* } } } */
-
-
-- 
2.27.0



[PATCH,rs6000] Fix operand order to subf for p10 fusion.

2021-06-02 Thread Aaron Sawdey via Gcc-patches
This certainly causes a bootstrap miscompare, and might also be
responsible for PR 100820. The operands to subf were reversed
in the logical-add/sub fusion patterns, and I screwed up my
bootstrap test, which is how it ended up getting committed.
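
For readers not steeped in Power mnemonics: subf RT,RA,RB computes
RT = RB - RA ("subtract from"), so the last two operands are easy to
swap by accident. A hedged sketch, matching the first family of
patterns fixed below:

    /* and  t,a,b     ; t = a & b
       subf r,c,t     ; r = t - c, i.e. (a & b) - c  (corrected order)
       The reversed "subf r,t,c" computed c - (a & b) instead.  */
    long and_sub (long a, long b, long c)
    {
      return (a & b) - c;
    }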

If bootstrap and regtest passes, ok for trunk (and eventual backport to 11.2)?

Thanks!
   Aaron

gcc/ChangeLog

* config/rs6000/genfusion.pl (gen_logical_addsubf): Fix input
order to subf instruction.
* config/rs6000/fusion.md: Regenerate file.
---
 gcc/config/rs6000/fusion.md| 64 +-
 gcc/config/rs6000/genfusion.pl | 20 ++-
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
index 51912106663..e642ff5f95f 100644
--- a/gcc/config/rs6000/fusion.md
+++ b/gcc/config/rs6000/fusion.md
@@ -1733,10 +1733,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   and %3,%1,%0\;subf %3,%3,%2
-   and %3,%1,%0\;subf %3,%3,%2
-   and %3,%1,%0\;subf %3,%3,%2
-   and %4,%1,%0\;subf %3,%4,%2"
+   and %3,%1,%0\;subf %3,%2,%3
+   and %3,%1,%0\;subf %3,%2,%3
+   and %3,%1,%0\;subf %3,%2,%3
+   and %4,%1,%0\;subf %3,%2,%4"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
@@ -1751,10 +1751,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   nand %3,%1,%0\;subf %3,%3,%2
-   nand %3,%1,%0\;subf %3,%3,%2
-   nand %3,%1,%0\;subf %3,%3,%2
-   nand %4,%1,%0\;subf %3,%4,%2"
+   nand %3,%1,%0\;subf %3,%2,%3
+   nand %3,%1,%0\;subf %3,%2,%3
+   nand %3,%1,%0\;subf %3,%2,%3
+   nand %4,%1,%0\;subf %3,%2,%4"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
@@ -1769,10 +1769,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   nor %3,%1,%0\;subf %3,%3,%2
-   nor %3,%1,%0\;subf %3,%3,%2
-   nor %3,%1,%0\;subf %3,%3,%2
-   nor %4,%1,%0\;subf %3,%4,%2"
+   nor %3,%1,%0\;subf %3,%2,%3
+   nor %3,%1,%0\;subf %3,%2,%3
+   nor %3,%1,%0\;subf %3,%2,%3
+   nor %4,%1,%0\;subf %3,%2,%4"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
@@ -1787,10 +1787,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   or %3,%1,%0\;subf %3,%3,%2
-   or %3,%1,%0\;subf %3,%3,%2
-   or %3,%1,%0\;subf %3,%3,%2
-   or %4,%1,%0\;subf %3,%4,%2"
+   or %3,%1,%0\;subf %3,%2,%3
+   or %3,%1,%0\;subf %3,%2,%3
+   or %3,%1,%0\;subf %3,%2,%3
+   or %4,%1,%0\;subf %3,%2,%4"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
@@ -1805,10 +1805,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   and %3,%1,%0\;subf %3,%2,%3
-   and %3,%1,%0\;subf %3,%2,%3
-   and %3,%1,%0\;subf %3,%2,%3
-   and %4,%1,%0\;subf %3,%2,%4"
+   and %3,%1,%0\;subf %3,%3,%2
+   and %3,%1,%0\;subf %3,%3,%2
+   and %3,%1,%0\;subf %3,%3,%2
+   and %4,%1,%0\;subf %3,%4,%2"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
@@ -1823,10 +1823,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   nand %3,%1,%0\;subf %3,%2,%3
-   nand %3,%1,%0\;subf %3,%2,%3
-   nand %3,%1,%0\;subf %3,%2,%3
-   nand %4,%1,%0\;subf %3,%2,%4"
+   nand %3,%1,%0\;subf %3,%3,%2
+   nand %3,%1,%0\;subf %3,%3,%2
+   nand %3,%1,%0\;subf %3,%3,%2
+   nand %4,%1,%0\;subf %3,%4,%2"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
@@ -1841,10 +1841,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   nor %3,%1,%0\;subf %3,%2,%3
-   nor %3,%1,%0\;subf %3,%2,%3
-   nor %3,%1,%0\;subf %3,%2,%3
-   nor %4,%1,%0\;subf %3,%2,%4"
+   nor %3,%1,%0\;subf %3,%3,%2
+   nor %3,%1,%0\;subf %3,%3,%2
+   nor %3,%1,%0\;subf %3,%3,%2
+   nor %4,%1,%0\;subf %3,%4,%2"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
@@ -1859,10 +1859,10 @@
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
   "(TARGET_P10_FUSION && TARGET_P10_FUSION_LOGADD)"
   "@
-   or %3,%1,%0\;subf %3,%2,%3
-   or %3,%1,%0\;subf %3,%2,%3
-   or %3,%1,%0\;subf %3,%2,%3
-   or %4,%1,%0\;subf %3,%2,%4"
+   or %3,%1,%0\;subf %3,%3,%2
+   or %3,%1,%0\;subf %3,%3,%2
+   or %3,%1,%0\;subf %3,%3,%2
+   or %4,%1,%0\;subf %3,%4,%2"
   [(set_attr "type" "fused_arith_logical")
(set_attr "cost" "6")
(set_attr "length" "8")])
diff --git a/gcc/config/rs6000/genfusion.pl b/gcc/config/rs6000/genfusion.pl
index 1285dd42043..577b9553deb 100755
--- a/gcc/config/rs6000/genfusion.pl
+++ b/gcc/config/rs6000/genfusion.pl
@@ -166,7 +166,7 @@ sub 

[PATCH,rs6000] Fix p10 fusion test cases for -m32

2021-05-26 Thread Aaron Sawdey via Gcc-patches
For some reason this never showed up on gcc-patches, trying again.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> Begin forwarded message:
> 
> From: Aaron Sawdey 
> Subject: [PATCH,rs6000] Fix p10 fusion test cases for -m32
> Date: May 25, 2021 at 1:45:36 PM CDT
> To: gcc-patches@gcc.gnu.org
> Cc: seg...@kernel.crashing.org, wschm...@linux.ibm.com, 
> will_schm...@vnet.ibm.com, Aaron Sawdey 
> 
> From: Aaron Sawdey 
> 
> The counts of fusion insns are slightly different for 32-bit compiles
> so we need different scan-assembler-times counts for 32 and 64 bit
> in the test cases for p10 fusion.
> 
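The updated files then carry paired directives along these lines (the
counts shown are placeholders, not the committed values):

    /* { dg-final { scan-assembler-times "fuse_and_and/2" 16 { target lp64 } } } */
    /* { dg-final { scan-assembler-times "fuse_and_and/2" 12 { target ilp32 } } } */
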
> Is this test case cleanup ok for trunk (and eventually 11.2 when
> the other fusion patches go there)?
> 
> Thanks!
>   Aaron
> 
> gcc/testsuite/ChangeLog
> 
>   * gcc.target/powerpc/fusion-p10-2logical.c: Update fused insn
>   counts to test 32 and 64 bit separately.
>   * gcc.target/powerpc/fusion-p10-addadd.c: Update fused insn
>   counts to test 32 and 64 bit separately.
>   * gcc.target/powerpc/fusion-p10-ldcmpi.c: Update fused insn
>   counts to test 32 and 64 bit separately.
>   * gcc.target/powerpc/fusion-p10-logadd.c: Update fused insn
>   counts to test 32 and 64 bit separately.
> ---
> .../gcc.target/powerpc/fusion-p10-2logical.c   | 394 ++---
> .../gcc.target/powerpc/fusion-p10-addadd.c |  19 +-
> .../gcc.target/powerpc/fusion-p10-ldcmpi.c |  34 +-
> .../gcc.target/powerpc/fusion-p10-logadd.c | 133 ---
> 4 files changed, 382 insertions(+), 198 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c 
> b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> index 9a20537..de22176 100644
> --- a/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> +++ b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> @@ -64,142 +64,262 @@ TEST(vboolchar_t);
> TEST(vuint_t);
> 
> /* Recreate with:
> -   grep ' \*fuse_' fusion-p10-2logical.s|sed -e 's,^.*\*,,' |sort -k 7,7 |uniq -c|awk '{l=30-length($2); printf("/%s* { %s { scan-assembler-times \"%s\"%-*s%4d } } *%s/\n","","dg-final",$2,l,"",$1,"");}'
> +   grep ' \*fuse_' fusion-p10-2logical.s|sed -e 's,^.*\*,,' |sort -k 7,7 |uniq -c|awk '{l=30-length($2); printf("/%s* { %s { scan-assembler-times \"%s\"%-*s%4d { target lp64 } } } *%s/\n","","dg-final",$2,l,"",$1,"");}'
>  */
> 
> -/* { dg-final { scan-assembler-times "fuse_and_and/1"    16 } } */
> -/* { dg-final { scan-assembler-times "fuse_and_and/2"    16 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_and/0"   16 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_and/1"   26 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_and/2"   48 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_and/3"    6 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_or/0"    16 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_or/1"    16 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_or/2"    32 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_orc/0"    8 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_orc/1"    8 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_orc/2"   48 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_xor/0"   16 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_xor/1"   16 } } */
> -/* { dg-final { scan-assembler-times "fuse_andc_xor/2"   32 } } */
> -/* { dg-final { scan-assembler-times "fuse_and_eqv/0"     8 } } */
> -/* { dg-final { scan-assembler-times "fuse_and_eqv/2"    24 } } */
> -/* { dg-final { scan-assembler-times "fuse_and_or/0"     16 } } */
> -/* { dg-final { scan-assembler-times "fuse_and_or/2"     16 } } */
> -/* { dg-final { scan-assembler-times "fuse_and_orc/0"     8 } } */
> -/* { dg-final { scan-assembler-times "

Re: [PATCH,rs6000 2/2] Fusion patterns for add-logical/logical-add

2021-05-24 Thread Aaron Sawdey via Gcc-patches
One last addendum to this. I discovered that genfusion.pl needs a "sort"
in front of "keys %logicals_addsub": otherwise the operators may come out
in a different order from run to run, which leaves fusion.md with its
patterns in a different order, which isn't helpful for sane debugging.
Segher and I discussed it offline, so I'm posting the final patch for
posterity.

Also coming will be some updates to the test cases. Things optimize
differently with -m32, apparently, so I'll have to add separate match
counts for the fusion patterns under "{ target ilp32 }", as the current
counts in those files only apply to "{ target lp64 }".

  Aaron

gcc/ChangeLog:
* config/rs6000/genfusion.pl (gen_logical_addsubf): Refactor to
add generation of logical-add and add-logical fusion pairs.
* config/rs6000/rs6000-cpus.def: Add new fusion to ISA 3.1 mask
and powerpc mask.
* config/rs6000/rs6000.c (rs6000_option_override_internal): Turn on
logical-add and add-logical fusion by default.
* config/rs6000.opt: Add -mpower10-fusion-logical-add and
-mpower10-fusion-add-logical options.
* config/rs6000/fusion.md: Regenerate file.

gcc/testsuite/ChangeLog:
* gcc.target/powerpc/fusion-p10-logadd.c: New file.
---
 gcc/config/rs6000/fusion.md   | 872 +-
 gcc/config/rs6000/genfusion.pl|  83 +-
 gcc/config/rs6000/rs6000-cpus.def |   4 +
 gcc/config/rs6000/rs6000.c|   8 +
 gcc/config/rs6000/rs6000.opt  |  12 +-
 .../gcc.target/powerpc/fusion-p10-logadd.c|  97 ++
 6 files changed, 797 insertions(+), 279 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-logadd.c

diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
index 4d810e6ba13..51912106663 100644
--- a/gcc/config/rs6000/fusion.md
+++ b/gcc/config/rs6000/fusion.md
@@ -355,11 +355,11 @@ (define_insn_and_split "*lbz_cmpldi_cr0_QI_GPR_CCUNS_zero"
(set_attr "length" "8")])
 
 
-;; logical-logical fusion pattern generated by gen_2logical
+;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar and -> and
 (define_insn "*fuse_and_and"
  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
-(and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") 
+(and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r")
   (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r"))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
@@ -373,11 +373,11 @@ (define_insn "*fuse_and_and"
(set_attr "cost" "6")
(set_attr "length" "8")])
 
-;; logical-logical fusion pattern generated by gen_2logical
+;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar andc -> and
 (define_insn "*fuse_andc_and"
  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
-(and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r")) 
+(and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r"))
   (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
@@ -391,11 +391,11 @@ (define_insn "*fuse_andc_and"
(set_attr "cost" "6")
(set_attr "length" "8")])
 
-;; logical-logical fusion pattern generated by gen_2logical
+;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar eqv -> and
 (define_insn "*fuse_eqv_and"
  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
-(and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r") 
+(and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r")
   (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
(clobber (match_scratch:GPR 4 "=X,X,X,&r"))]
@@ -409,11 +409,11 @@ (define_insn "*fuse_eqv_and"
(set_attr "cost" "6")
(set_attr "length" "8")])
 
-;; logical-logical fusion pattern generated by gen_2logical
+;; logical-logical fusion pattern generated by gen_logical_addsubf
 ;; scalar nand -> and
 (define_insn "*fuse_nand_and"
  [(set (match_operand:GPR 3 "gpc_reg_operand" "=0,1,&r,r")
-(and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r")) 
+(and:GPR (ior:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
"r,r,r,r"))
   (not:GPR (match_operand:GPR 1 "gpc_reg_operand" 
"r,r,r,r")))
  (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
(clobber (match_scratch:GPR 4 "=X,X,X,"))]
@@ -427,11 +427,11 @@ (define_insn "*fuse_nand_and"
(set_attr "cost" "6")
(set_attr "length" "8")])
 
-;; logical-logical fusion pattern generated by gen_2logical

Re: [PATCH,rs6000] Test cases for p10 fusion patterns

2021-05-11 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Apr 26, 2021, at 2:00 PM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> This adds some test cases to make sure that the combine patterns for p10
> fusion are working.
> 
> OK for trunk?
> 
> gcc/testsuite/ChangeLog:
>   * gcc.target/powerpc/fusion-p10-ldcmpi.c: New file.
>   * gcc.target/powerpc/fusion-p10-2logical.c: New file.
> ---
> .../gcc.target/powerpc/fusion-p10-2logical.c  | 205 ++
> .../gcc.target/powerpc/fusion-p10-ldcmpi.c|  66 ++
> 2 files changed, 271 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-ldcmpi.c
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c 
> b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> new file mode 100644
> index 000..9a205373505
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> @@ -0,0 +1,205 @@
> +/* { dg-do compile { target { powerpc*-*-* } } } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O3 -dp" } */
> +
> +#include 
> +#include 
> +
> +/* and/andc/eqv/nand/nor/or/orc/xor */
> +#define AND(a,b) ((a)&(b))
> +#define ANDC1(a,b) ((a)&((~b)))
> +#define ANDC2(a,b) ((~(a))&(b))
> +#define EQV(a,b) (~((a)^(b)))
> +#define NAND(a,b) (~((a)&(b)))
> +#define NOR(a,b) (~((a)|(b)))
> +#define OR(a,b) ((a)|(b))
> +#define ORC1(a,b) ((a)|((~b)))
> +#define ORC2(a,b) ((~(a))|(b))
> +#define XOR(a,b) ((a)^(b))
> +#define TEST1(type, func) \
> +  type func ## _and_T_ ## type (type a, type b, type c) { return AND(func(a,b),c); } \
> +  type func ## _andc1_T_   ## type (type a, type b, type c) { return ANDC1(func(a,b),c); } \
> +  type func ## _andc2_T_   ## type (type a, type b, type c) { return ANDC2(func(a,b),c); } \
> +  type func ## _eqv_T_ ## type (type a, type b, type c) { return EQV(func(a,b),c); } \
> +  type func ## _nand_T_## type (type a, type b, type c) { return NAND(func(a,b),c); } \
> +  type func ## _nor_T_ ## type (type a, type b, type c) { return NOR(func(a,b),c); } \
> +  type func ## _or_T_  ## type (type a, type b, type c) { return OR(func(a,b),c); } \
> +  type func ## _orc1_T_## type (type a, type b, type c) { return ORC1(func(a,b),c); } \
> +  type func ## _orc2_T_## type (type a, type b, type c) { return ORC2(func(a,b),c); } \
> +  type func ## _xor_T_ ## type (type a, type b, type c) { return XOR(func(a,b),c); } \
> +  type func ## _rev_and_T_ ## type (type a, type b, type c) { return AND(c,func(a,b)); } \
> +  type func ## _rev_andc1_T_   ## type (type a, type b, type c) { return ANDC1(c,func(a,b)); } \
> +  type func ## _rev_andc2_T_   ## type (type a, type b, type c) { return ANDC2(c,func(a,b)); } \
> +  type func ## _rev_eqv_T_ ## type (type a, type b, type c) { return EQV(c,func(a,b)); } \
> +  type func ## _rev_nand_T_## type (type a, type b, type c) { return NAND(c,func(a,b)); } \
> +  type func ## _rev_nor_T_ ## type (type a, type b, type c) { return NOR(c,func(a,b)); } \
> +  type func ## _rev_or_T_  ## type (type a, type b, type c) { return OR(c,func(a,b)); } \
> +  type func ## _rev_orc1_T_## type (type a, type b, type c) { return ORC1(c,func(a,b)); } \
> +  type func ## _rev_orc2_T_## type (type a, type b, type c) { return ORC2(c,func(a,b)); } \
> +  type func ## _rev_xor_T_ ## type (type a, type b, type c) { return XOR(c,func(a,b)); }
> +#define TEST(type)\
> +  TEST1(type,AND) \
> +  TEST1(type,ANDC1)   \
> +  TEST1(type,ANDC2)   \
> +  TEST1(type,EQV) \
> +  TEST1(type,NAND)\
> +  TEST1(type,NOR) \
> +  TEST1(type,OR)  \
> +  TEST1(type,ORC1)\
> +  TEST1(type,ORC2)\
> +  TEST1(type,XOR)
> +
> +typedef vector bool char vboolchar_t;
> +typedef vector unsigned int vuint_t;
> +
> +TEST(uint8_t);
> +TEST(int8_t);
> +TEST(uint16_t);
> +TEST(int16_t);
> +TEST(uint32_t);
> +TEST(int32_t);
> +TEST(uint64_t);
> +TEST(int64_t);
> +TEST(vboolchar_t);
> +TEST(vuint_t);
> +
> +/* Recreate with:
> +   grep ' \*fuse_' fusion-p10-2logical.s|sed -e 's,^.*\*,,' |sort -k 7,7 |uniq -c|awk '{l=30-length($2); printf("/%s* { %s { scan-assembler-times \"%s\"%-*s%4d } } *%s/\n","","dg-final",$2,l,"",$1,"");}'
> + */
> +  
> +/* { dg-fin

Re: [PATCH,rs6000] Add insn types for fusion pairs

2021-05-11 Thread Aaron Sawdey via Gcc-patches
Ping.

In answer to Will’s question — some of these are not immediately used but will 
be in other pending patches.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Apr 26, 2021, at 1:04 PM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> This adds new values for insn attr type for p10 fusion. The genfusion.pl
> script is modified to use them, and fusion.md regenerated to capture
> the new patterns. There are also some formatting only changes to
> fusion.md that apparently weren't captured after a previous commit
> of genfusion.pl.
> 
> If bootstrap/regtest passes, OK for trunk and backport to 11.2?
> 
> Thanks,
>Aaron
> 
> gcc/
>   * config/rs6000/rs6000.md (define_attr "type"): Add types for fusion.
>   * config/rs6000/genfusion.pl (gen_ld_cmpi_p10): Use new fusion types.
>   (gen_2logical): Use new fusion types.
>   * config/rs6000/fusion.md: Regenerate.
> ---
> gcc/config/rs6000/fusion.md| 288 -
> gcc/config/rs6000/genfusion.pl |   8 +-
> gcc/config/rs6000/rs6000.md|   4 +-
> 3 files changed, 152 insertions(+), 148 deletions(-)
> 
> diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
> index 56478fcae1d..6d71bc2df73 100644
> --- a/gcc/config/rs6000/fusion.md
> +++ b/gcc/config/rs6000/fusion.md
> @@ -35,7 +35,7 @@ (define_insn_and_split "*ld_cmpdi_cr0_DI_clobber_CC_none"
>(set (match_dup 2)
> (compare:CC (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -56,7 +56,7 @@ (define_insn_and_split 
> "*ld_cmpldi_cr0_DI_clobber_CCUNS_none"
>(set (match_dup 2)
> (compare:CCUNS (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -77,7 +77,7 @@ (define_insn_and_split "*ld_cmpdi_cr0_DI_DI_CC_none"
>(set (match_dup 2)
> (compare:CC (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -98,7 +98,7 @@ (define_insn_and_split "*ld_cmpldi_cr0_DI_DI_CCUNS_none"
>(set (match_dup 2)
> (compare:CCUNS (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -119,7 +119,7 @@ (define_insn_and_split "*lwa_cmpdi_cr0_SI_clobber_CC_none"
>(set (match_dup 2)
> (compare:CC (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -140,7 +140,7 @@ (define_insn_and_split 
> "*lwz_cmpldi_cr0_SI_clobber_CCUNS_none"
>(set (match_dup 2)
> (compare:CCUNS (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -161,7 +161,7 @@ (define_insn_and_split "*lwa_cmpdi_cr0_SI_SI_CC_none"
>(set (match_dup 2)
> (compare:CC (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -182,7 +182,7 @@ (define_insn_and_split "*lwz_cmpldi_cr0_SI_SI_CCUNS_none"
>(set (match_dup 2)
> (compare:CCUNS (match_dup 0) (match_dup 3)))]
>   ""
> -  [(set_attr "type" "load")
> +  [(set_attr "type" "fused_load_cmpi")
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> @@ -203,7 +203,7 @@ (define_insn_and_split "*lwa_cmpdi_cr0_SI_EXTSI_CC_sign"
>(set (match_dup 2)
> (compare:CC (match_dup 0) (match_dup 3)))]
>

Re: [PATCH,rs6000 0/2] p10 add-add and add-logical fusion series

2021-05-11 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Apr 26, 2021, at 3:21 PM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> Two more sets of combine patterns for p10 fusion. These require 
> the "Add insn types for fusion pairs" patch I posted earlier today.
> 
> If ok I would like to put these in gcc 12 trunk and backport for 11.2.
> 
> Thanks,
>   Aaron
> 
> Aaron Sawdey (2):
>  combine patterns for add-add fusion
>  Fusion patterns for add-logical/logical-add
> 
> gcc/config/rs6000/fusion.md   | 908 +-
> gcc/config/rs6000/genfusion.pl| 127 ++-
> gcc/config/rs6000/rs6000-cpus.def |   8 +-
> gcc/config/rs6000/rs6000.c|   9 +
> gcc/config/rs6000/rs6000.opt  |  12 +
> .../gcc.target/powerpc/fusion-p10-addadd.c|  41 +
> .../gcc.target/powerpc/fusion-p10-logadd.c|  98 ++
> 7 files changed, 925 insertions(+), 278 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-addadd.c
> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-logadd.c
> 
> -- 
> 2.27.0
> 



Re: [PATCH,rs6000] Optimize pcrel access of globals [ping]

2021-01-18 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Dec 9, 2020, at 11:04 AM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> Ping. I've folded in the changes to comments suggested by Will Schmidt.
> 
> This patch implements a RTL pass that looks for pc-relative loads of the
> address of an external variable using the PCREL_GOT relocation and a
> single load or store that uses that external address.
> 
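A hedged sketch of what the pass looks for (register numbers and the
exact rewrite are illustrative):

    extern int ext_var;
    int load_ext (void)
    {
      return ext_var;
      /* Compiled for power10:  pld 9,ext_var@got@pcrel  (address from GOT)
                                lwa 3,0(9)               (then the value)
         The pass tags such a pair with an R_PPC64_PCREL_OPT relocation so
         the linker may rewrite it into a single direct pc-relative load
         when the symbol turns out to be defined locally.  */
    }
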
> Produced by a cast of thousands:
> * Michael Meissner
> * Peter Bergner
> * Bill Schmidt
> * Alan Modra
> * Segher Boessenkool
> * Aaron Sawdey
> 
> Passes bootstrap/regtest on ppc64le power10. Should have no effect on
> other processors. OK for trunk?
> 
> Thanks!
>   Aaron
> 
> gcc/ChangeLog:
> 
>   * config.gcc: Add pcrel-opt.c and pcrel-opt.o.
>   * config/rs6000/pcrel-opt.c: New file.
>   * config/rs6000/pcrel-opt.md: New file.
>   * config/rs6000/predicates.md: Add d_form_memory predicate.
>   * config/rs6000/rs6000-cpus.def: Add OPTION_MASK_PCREL_OPT.
>   * config/rs6000/rs6000-passes.def: Add pass_pcrel_opt.
>   * config/rs6000/rs6000-protos.h: Add reg_to_non_prefixed(),
>   offsettable_non_prefixed_memory(), output_pcrel_opt_reloc(),
>   and make_pass_pcrel_opt().
>   * config/rs6000/rs6000.c (reg_to_non_prefixed): Make global.
>   (rs6000_option_override_internal): Add pcrel-opt.
>   (rs6000_delegitimize_address): Support pcrel-opt.
>   (rs6000_opt_masks): Add pcrel-opt.
>   (offsettable_non_prefixed_memory): New function.
>   (rs6000_asm_output_opcode): Reset next_insn_prefixed_p.
>   (output_pcrel_opt_reloc): New function.
>   * config/rs6000/rs6000.md (loads_extern_addr): New attr.
>   (pcrel_extern_addr): Set loads_extern_addr.
>   Add include for pcrel-opt.md.
>   * config/rs6000/rs6000.opt: Add -mpcrel-opt.
>   * config/rs6000/t-rs6000: Add rules for pcrel-opt.c and
>   pcrel-opt.md.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/powerpc/pcrel-opt-inc-di.c: New test.
>   * gcc.target/powerpc/pcrel-opt-ld-df.c: New test.
>   * gcc.target/powerpc/pcrel-opt-ld-di.c: New test.
>   * gcc.target/powerpc/pcrel-opt-ld-hi.c: New test.
>   * gcc.target/powerpc/pcrel-opt-ld-qi.c: New test.
>   * gcc.target/powerpc/pcrel-opt-ld-sf.c: New test.
>   * gcc.target/powerpc/pcrel-opt-ld-si.c: New test.
>   * gcc.target/powerpc/pcrel-opt-ld-vector.c: New test.
>   * gcc.target/powerpc/pcrel-opt-st-df.c: New test.
>   * gcc.target/powerpc/pcrel-opt-st-di.c: New test.
>   * gcc.target/powerpc/pcrel-opt-st-hi.c: New test.
>   * gcc.target/powerpc/pcrel-opt-st-qi.c: New test.
>   * gcc.target/powerpc/pcrel-opt-st-sf.c: New test.
>   * gcc.target/powerpc/pcrel-opt-st-si.c: New test.
>   * gcc.target/powerpc/pcrel-opt-st-vector.c: New test.
> ---
> gcc/config.gcc|   6 +-
> gcc/config/rs6000/pcrel-opt.c | 888 ++
> gcc/config/rs6000/pcrel-opt.md| 386 
> gcc/config/rs6000/predicates.md   |  23 +
> gcc/config/rs6000/rs6000-cpus.def |   2 +
> gcc/config/rs6000/rs6000-passes.def   |   8 +
> gcc/config/rs6000/rs6000-protos.h |   4 +
> gcc/config/rs6000/rs6000.c| 116 ++-
> gcc/config/rs6000/rs6000.md   |   8 +-
> gcc/config/rs6000/rs6000.opt  |   4 +
> gcc/config/rs6000/t-rs6000|   7 +-
> .../gcc.target/powerpc/pcrel-opt-inc-di.c |  18 +
> .../gcc.target/powerpc/pcrel-opt-ld-df.c  |  36 +
> .../gcc.target/powerpc/pcrel-opt-ld-di.c  |  43 +
> .../gcc.target/powerpc/pcrel-opt-ld-hi.c  |  42 +
> .../gcc.target/powerpc/pcrel-opt-ld-qi.c  |  42 +
> .../gcc.target/powerpc/pcrel-opt-ld-sf.c  |  42 +
> .../gcc.target/powerpc/pcrel-opt-ld-si.c  |  41 +
> .../gcc.target/powerpc/pcrel-opt-ld-vector.c  |  36 +
> .../gcc.target/powerpc/pcrel-opt-st-df.c  |  36 +
> .../gcc.target/powerpc/pcrel-opt-st-di.c  |  37 +
> .../gcc.target/powerpc/pcrel-opt-st-hi.c  |  42 +
> .../gcc.target/powerpc/pcrel-opt-st-qi.c  |  42 +
> .../gcc.target/powerpc/pcrel-opt-st-sf.c  |  36 +
> .../gcc.target/powerpc/pcrel-opt-st-si.c  |  41 +
> .../gcc.target/powerpc/pcrel-opt-st-vector.c  |  36 +
> 26 files changed, 2013 insertions(+), 9 deletions(-)
> create mode 100644 gcc/config/rs6000/pcrel-opt.c
> create mode 100644 gcc/config/rs6000/pcrel-opt.md
> create mode 100644 gcc/testsuite/gcc.target/powerpc/pcrel-opt-inc-di.c
> create mode 100644 gcc/testsuite/gcc.target/powerpc/pcr

Re: [PATCH,rs6000] Test cases for p10 fusion patterns

2021-01-18 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Jan 3, 2021, at 2:44 PM, Aaron Sawdey  wrote:
> 
> Ping.
> 
> Aaron Sawdey, Ph.D. saw...@linux.ibm.com
> IBM Linux on POWER Toolchain
> 
> 
>> On Dec 11, 2020, at 1:53 PM, acsaw...@linux.ibm.com wrote:
>> 
>> From: Aaron Sawdey 
>> 
>> This adds some test cases to make sure that the combine patterns for p10
>> fusion are working.
>> 
>> These test cases pass on power10. OK for trunk after the 2 previous patches
>> for the fusion patterns go in?
>> 
>> Thanks!
>>  Aaron
>> 
>> gcc/testsuite/ChangeLog:
>>  * gcc.target/powerpc/fusion-p10-ldcmpi.c: New file.
>>  * gcc.target/powerpc/fusion-p10-2logical.c: New file.
>> ---
>> .../gcc.target/powerpc/fusion-p10-2logical.c  | 201 ++
>> .../gcc.target/powerpc/fusion-p10-ldcmpi.c|  66 ++
>> 2 files changed, 267 insertions(+)
>> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
>> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-ldcmpi.c
>> 
>> diff --git a/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c 
>> b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
>> new file mode 100644
>> index 000..cfe8f6c679a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
>> @@ -0,0 +1,201 @@
>> +/* { dg-do compile { target { powerpc*-*-* } } } */
>> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O3 -dp" } */
>> +
>> +#include 
>> +#include 
>> +
>> +/* and/andc/eqv/nand/nor/or/orc/xor */
>> +#define AND(a,b) ((a)&(b))
>> +#define ANDC1(a,b) ((a)&((~b)))
>> +#define ANDC2(a,b) ((~(a))&(b))
>> +#define EQV(a,b) (~((a)^(b)))
>> +#define NAND(a,b) (~((a)&(b)))
>> +#define NOR(a,b) (~((a)|(b)))
>> +#define OR(a,b) ((a)|(b))
>> +#define ORC1(a,b) ((a)|((~b)))
>> +#define ORC2(a,b) ((~(a))|(b))
>> +#define XOR(a,b) ((a)^(b))
>> +#define TEST1(type, func) \
>> +  type func ## _and_T_ ## type (type a, type b, type c) { return AND(func(a,b),c); } \
>> +  type func ## _andc1_T_   ## type (type a, type b, type c) { return ANDC1(func(a,b),c); } \
>> +  type func ## _andc2_T_   ## type (type a, type b, type c) { return ANDC2(func(a,b),c); } \
>> +  type func ## _eqv_T_ ## type (type a, type b, type c) { return EQV(func(a,b),c); } \
>> +  type func ## _nand_T_## type (type a, type b, type c) { return NAND(func(a,b),c); } \
>> +  type func ## _nor_T_ ## type (type a, type b, type c) { return NOR(func(a,b),c); } \
>> +  type func ## _or_T_  ## type (type a, type b, type c) { return OR(func(a,b),c); } \
>> +  type func ## _orc1_T_## type (type a, type b, type c) { return ORC1(func(a,b),c); } \
>> +  type func ## _orc2_T_## type (type a, type b, type c) { return ORC2(func(a,b),c); } \
>> +  type func ## _xor_T_ ## type (type a, type b, type c) { return XOR(func(a,b),c); } \
>> +  type func ## _rev_and_T_ ## type (type a, type b, type c) { return AND(c,func(a,b)); } \
>> +  type func ## _rev_andc1_T_   ## type (type a, type b, type c) { return ANDC1(c,func(a,b)); } \
>> +  type func ## _rev_andc2_T_   ## type (type a, type b, type c) { return ANDC2(c,func(a,b)); } \
>> +  type func ## _rev_eqv_T_ ## type (type a, type b, type c) { return EQV(c,func(a,b)); } \
>> +  type func ## _rev_nand_T_## type (type a, type b, type c) { return NAND(c,func(a,b)); } \
>> +  type func ## _rev_nor_T_ ## type (type a, type b, type c) { return NOR(c,func(a,b)); } \
>> +  type func ## _rev_or_T_  ## type (type a, type b, type c) { return OR(c,func(a,b)); } \
>> +  type func ## _rev_orc1_T_## type (type a, type b, type c) { return ORC1(c,func(a,b)); } \
>> +  type func ## _rev_orc2_T_## type (type a, type b, type c) { return ORC2(c,func(a,b)); } \
>> +  type func ## _rev_xor_T_ ## type (type a, type b, type c) { return XOR(c,func(a,b)); }
>> +#define TEST(type)\
>> +  TEST1(type,AND) \
>> +  TEST1(type,ANDC1)   \
>> +  TEST1(type,ANDC2)   \
>> +  TEST1(type,EQV) \
>> +  TEST1(type,NAND)\
>> +  TEST1(type,NOR) \
>> +  TEST1(type,OR)  \
>> +  TEST1(type,ORC1)\
>> +  TEST1(type,ORC2)\
>> +  TES

Re: [PATCH,rs6000] Fusion patterns for logical-logical

2021-01-18 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Jan 3, 2021, at 2:43 PM, Aaron Sawdey  wrote:
> 
> Ping.
> 
> Aaron Sawdey, Ph.D. saw...@linux.ibm.com
> IBM Linux on POWER Toolchain
> 
> 
>> On Dec 10, 2020, at 8:41 PM, acsaw...@linux.ibm.com wrote:
>> 
>> From: Aaron Sawdey 
>> 
>> This patch adds a new function to genfusion.pl to generate patterns for
>> logical-logical fusion. They are enabled by default for power10 and can
>> be disabled by -mno-power10-fusion-2logical or -mno-power10-fusion.
>> 
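A sketch (mine, not from the patch) of the source shape these patterns
catch: two dependent GPR logical ops that power10 can fuse when they are
emitted back to back.

    long two_logical (long a, long b, long c)
    {
      return (a & b) ^ c;   /* and t,a,b ; xor r,t,c */
    }
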
>> This patch builds on top of the load-cmpi patch posted earlier this week.
>> 
>> Bootstrap passed on ppc64le/power10, if regtests pass, ok for trunk?
>> 
>> gcc/ChangeLog
>>  * config/rs6000/genfusion.pl (gen_2logical): New function to
>>  generate patterns for logical-logical fusion.
>>  * config/rs6000/fusion.md: Regenerated patterns.
>>  * config/rs6000/rs6000-cpus.def: Add
>>  OPTION_MASK_P10_FUSION_2LOGICAL.
>>  * config/rs6000/rs6000.c (rs6000_option_override_internal):
>>  Enable logical-logical fusion for p10.
>>  * config/rs6000/rs6000.opt: Add -mpower10-fusion-2logical.
>> ---
>> gcc/config/rs6000/fusion.md   | 2176 +
>> gcc/config/rs6000/genfusion.pl|   89 ++
>> gcc/config/rs6000/rs6000-cpus.def |4 +-
>> gcc/config/rs6000/rs6000.c|3 +
>> gcc/config/rs6000/rs6000.opt  |4 +
>> 5 files changed, 2275 insertions(+), 1 deletion(-)
>> 
>> diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
>> index a4d3a6ae7f3..1ddbe7fe3d2 100644
>> --- a/gcc/config/rs6000/fusion.md
>> +++ b/gcc/config/rs6000/fusion.md
>> @@ -355,3 +355,2179 @@ (define_insn_and_split 
>> "*lbz_cmpldi_cr0_QI_GPR_CCUNS_zero"
>>   (set_attr "cost" "8")
>>   (set_attr "length" "8")])
>> 
>> +
>> +;; logical-logical fusion pattern generated by gen_2logical
>> +;; kind: scalar outer: and op and rtl and inv 0 comp 0
>> +;; inner: and op and rtl and inv 0 comp 0
>> +(define_insn "*fuse_and_and"
>> +  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&r,0,1,r")
>> +(and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") 
>> (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 
>> "gpc_reg_operand" "r,r,r,r")))
>> +   (clobber (match_scratch:GPR 4 "=X,X,X,r"))]
>> +  "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)"
>> +  "@
>> +   and %3,%1,%0\;and %3,%3,%2
>> +   and %0,%1,%0\;and %0,%0,%2
>> +   and %1,%1,%0\;and %1,%1,%2
>> +   and %4,%1,%0\;and %3,%4,%2"
>> +  [(set_attr "type" "logical")
>> +   (set_attr "cost" "6")
>> +   (set_attr "length" "8")])
>> +
>> +;; logical-logical fusion pattern generated by gen_2logical
>> +;; kind: scalar outer: and op and rtl and inv 0 comp 0
>> +;; inner: andc op andc rtl and inv 0 comp 1
>> +(define_insn "*fuse_andc_and"
>> +  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&r,0,1,r")
>> +(and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
>> "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) 
>> (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
>> +   (clobber (match_scratch:GPR 4 "=X,X,X,r"))]
>> +  "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)"
>> +  "@
>> +   andc %3,%1,%0\;and %3,%3,%2
>> +   andc %0,%1,%0\;and %0,%0,%2
>> +   andc %1,%1,%0\;and %1,%1,%2
>> +   andc %4,%1,%0\;and %3,%4,%2"
>> +  [(set_attr "type" "logical")
>> +   (set_attr "cost" "6")
>> +   (set_attr "length" "8")])
>> +
>> +;; logical-logical fusion pattern generated by gen_2logical
>> +;; kind: scalar outer: and op and rtl and inv 0 comp 0
>> +;; inner: eqv op eqv rtl xor inv 1 comp 0
>> +(define_insn "*fuse_eqv_and"
>> +  [(set (match_operand:GPR 3 "gpc_reg_operand" "=&r,0,1,r")
>> +(and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" 
>> "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) 
>> (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
>

Re: [PATCH,rs6000] Combine patterns for p10 load-cmpi fusion

2021-01-18 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Jan 3, 2021, at 2:42 PM, Aaron Sawdey  wrote:
> 
> Ping.
> 
> I assume we’re going to want a separate patch for the new instruction type.
> 
> Aaron Sawdey, Ph.D. saw...@linux.ibm.com
> IBM Linux on POWER Toolchain
> 
> 
>> On Dec 4, 2020, at 1:19 PM, acsaw...@linux.ibm.com wrote:
>> 
>> From: Aaron Sawdey 
>> 
>> This patch adds the first batch of patterns to support p10 fusion. These
>> will allow combine to create a single insn for a pair of instructions
>> that power10 can fuse and execute. These particular ones have the
>> requirement that only cr0 can be used when fusing a load with a compare
>> immediate of -1/0/1 (if signed) or 0/1 (if unsigned), so we want combine
>> to put that requirement in, and if it doesn't work out later the splitter
>> can get used.
>> 
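A hedged illustration of the kind of sequence being fused:

    /* The load and compare-immediate below can fuse on power10; per the
       note above, the compare must target cr0 and use an immediate in
       -1/0/1 (signed) or 0/1 (unsigned).  */
    long load_cmp (long *p)
    {
      return *p == 0 ? 10 : 20;   /* ld 9,0(3) ; cmpdi 0,9,0 */
    }
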
>> The patterns are generated by a script genfusion.pl and live in new file
>> fusion.md. This script will be expanded to generate more patterns for
>> fusion.
>> 
>> This also adds option -mpower10-fusion which defaults on for power10 and
>> will gate all these fusion patterns. In addition I have added an
>> undocumented option -mpower10-fusion-ld-cmpi (which may be removed later)
>> that just controls the load+compare-immediate patterns. I have made
>> these default on for power10 but they are not disallowed for earlier
>> processors because it is still valid code. This allows us to test the
>> correctness of fusion code generation by turning it on explicitly.
>> 
>> If bootstrap/regtest is clean, ok for trunk?
>> 
>> Thanks!
>> 
>>  Aaron
>> 
>> gcc/ChangeLog:
>> 
>>  * config/rs6000/genfusion.pl: New file, script to generate
>>  define_insn_and_split patterns so combine can arrange fused
>>  instructions next to each other.
>>  * config/rs6000/fusion.md: New file, generated fused instruction
>>  patterns for combine.
>>  * config/rs6000/predicates.md (const_m1_to_1_operand): New predicate.
>>  (non_update_memory_operand): New predicate.
>>  * config/rs6000/rs6000-cpus.def: Add OPTION_MASK_P10_FUSION and
>>  OPTION_MASK_P10_FUSION_LD_CMPI to ISA_3_1_MASKS_SERVER and
>>  POWERPC_MASKS.
>>  * config/rs6000/rs6000-protos.h (address_is_non_pfx_d_or_x): Add
>>  prototype.
>>  * config/rs6000/rs6000.c (rs6000_option_override_internal):
>>  automatically set -mpower10-fusion and -mpower10-fusion-ld-cmpi
>>  if target is power10.  (rs6000_opt_masks): Allow -mpower10-fusion
>>  in function attributes.  (address_is_non_pfx_d_or_x): New function.
>>  * config/rs6000/rs6000.h: Add MASK_P10_FUSION.
>>  * config/rs6000/rs6000.md: Include fusion.md.
>>  * config/rs6000/rs6000.opt: Add -mpower10-fusion
>>  and -mpower10-fusion-ld-cmpi.
>>  * config/rs6000/t-rs6000: Add dependencies involving fusion.md.
>> ---
>> gcc/config/rs6000/fusion.md   | 357 ++
>> gcc/config/rs6000/genfusion.pl| 144 
>> gcc/config/rs6000/predicates.md   |  14 ++
>> gcc/config/rs6000/rs6000-cpus.def |   6 +-
>> gcc/config/rs6000/rs6000-protos.h |   2 +
>> gcc/config/rs6000/rs6000.c|  51 +
>> gcc/config/rs6000/rs6000.h|   1 +
>> gcc/config/rs6000/rs6000.md   |   1 +
>> gcc/config/rs6000/rs6000.opt  |   8 +
>> gcc/config/rs6000/t-rs6000|   6 +-
>> 10 files changed, 588 insertions(+), 2 deletions(-)
>> create mode 100644 gcc/config/rs6000/fusion.md
>> create mode 100755 gcc/config/rs6000/genfusion.pl
>> 
>> diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
>> new file mode 100644
>> index 000..a4d3a6ae7f3
>> --- /dev/null
>> +++ b/gcc/config/rs6000/fusion.md
>> @@ -0,0 +1,357 @@
>> +;; -*- buffer-read-only: t -*-
>> +;; Generated automatically by genfusion.pl
>> +
>> +;; Copyright (C) 2020 Free Software Foundation, Inc.
>> +;;
>> +;; This file is part of GCC.
>> +;;
>> +;; GCC is free software; you can redistribute it and/or modify it under
>> +;; the terms of the GNU General Public License as published by the Free
>> +;; Software Foundation; either version 3, or (at your option) any later
>> +;; version.
>> +;;
>> +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
>> +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> +;; FITNESS FOR A PARTICULAR PURPOSE.  See the GN

Re: [PATCH,rs6000] Test cases for p10 fusion patterns

2021-01-03 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Dec 11, 2020, at 1:53 PM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> This adds some test cases to make sure that the combine patterns for p10
> fusion are working.
> 
> These test cases pass on power10. OK for trunk after the 2 previous patches
> for the fusion patterns go in?
> 
> Thanks!
>   Aaron
> 
> gcc/testsuite/ChangeLog:
>   * gcc.target/powerpc/fusion-p10-ldcmpi.c: New file.
>   * gcc.target/powerpc/fusion-p10-2logical.c: New file.
> ---
> .../gcc.target/powerpc/fusion-p10-2logical.c  | 201 ++
> .../gcc.target/powerpc/fusion-p10-ldcmpi.c|  66 ++
> 2 files changed, 267 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> create mode 100644 gcc/testsuite/gcc.target/powerpc/fusion-p10-ldcmpi.c
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c 
> b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> new file mode 100644
> index 000..cfe8f6c679a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/fusion-p10-2logical.c
> @@ -0,0 +1,201 @@
> +/* { dg-do compile { target { powerpc*-*-* } } } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O3 -dp" } */
> +
> +#include 
> +#include 
> +
> +/* and/andc/eqv/nand/nor/or/orc/xor */
> +#define AND(a,b) ((a)&(b))
> +#define ANDC1(a,b) ((a)&((~b)))
> +#define ANDC2(a,b) ((~(a))&(b))
> +#define EQV(a,b) (~((a)^(b)))
> +#define NAND(a,b) (~((a)&(b)))
> +#define NOR(a,b) (~((a)|(b)))
> +#define OR(a,b) ((a)|(b))
> +#define ORC1(a,b) ((a)|((~b)))
> +#define ORC2(a,b) ((~(a))|(b))
> +#define XOR(a,b) ((a)^(b))
> +#define TEST1(type, func)
> \
> +  type func ## _and_T_ ## type (type a, type b, type c) { return 
> AND(func(a,b),c); } \
> +  type func ## _andc1_T_   ## type (type a, type b, type c) { return 
> ANDC1(func(a,b),c); } \
> +  type func ## _andc2_T_   ## type (type a, type b, type c) { return 
> ANDC2(func(a,b),c); } \
> +  type func ## _eqv_T_ ## type (type a, type b, type c) { return 
> EQV(func(a,b),c); } \
> +  type func ## _nand_T_## type (type a, type b, type c) { return 
> NAND(func(a,b),c); } \
> +  type func ## _nor_T_ ## type (type a, type b, type c) { return 
> NOR(func(a,b),c); } \
> +  type func ## _or_T_  ## type (type a, type b, type c) { return 
> OR(func(a,b),c); } \
> +  type func ## _orc1_T_## type (type a, type b, type c) { return 
> ORC1(func(a,b),c); } \
> +  type func ## _orc2_T_## type (type a, type b, type c) { return 
> ORC2(func(a,b),c); } \
> +  type func ## _xor_T_ ## type (type a, type b, type c) { return 
> XOR(func(a,b),c); } \
> +  type func ## _rev_and_T_ ## type (type a, type b, type c) { return 
> AND(c,func(a,b)); } \
> +  type func ## _rev_andc1_T_   ## type (type a, type b, type c) { return 
> ANDC1(c,func(a,b)); } \
> +  type func ## _rev_andc2_T_   ## type (type a, type b, type c) { return 
> ANDC2(c,func(a,b)); } \
> +  type func ## _rev_eqv_T_ ## type (type a, type b, type c) { return 
> EQV(c,func(a,b)); } \
> +  type func ## _rev_nand_T_## type (type a, type b, type c) { return 
> NAND(c,func(a,b)); } \
> +  type func ## _rev_nor_T_ ## type (type a, type b, type c) { return 
> NOR(c,func(a,b)); } \
> +  type func ## _rev_or_T_  ## type (type a, type b, type c) { return 
> OR(c,func(a,b)); } \
> +  type func ## _rev_orc1_T_## type (type a, type b, type c) { return 
> ORC1(c,func(a,b)); } \
> +  type func ## _rev_orc2_T_## type (type a, type b, type c) { return 
> ORC2(c,func(a,b)); } \
> +  type func ## _rev_xor_T_ ## type (type a, type b, type c) { return 
> XOR(c,func(a,b)); }
> +#define TEST(type)\
> +  TEST1(type,AND) \
> +  TEST1(type,ANDC1)   \
> +  TEST1(type,ANDC2)   \
> +  TEST1(type,EQV) \
> +  TEST1(type,NAND)\
> +  TEST1(type,NOR) \
> +  TEST1(type,OR)  \
> +  TEST1(type,ORC1)\
> +  TEST1(type,ORC2)\
> +  TEST1(type,XOR)
> +
> +typedef vector bool char vboolchar_t;
> +typedef vector unsigned int vuint_t;
> +
> +TEST(uint8_t);
> +TEST(int8_t);
> +TEST(uint16_t);
> +TEST(int16_t);
> +TEST(uint32_t);
> +TEST(int32_t);
> +TEST(uint64_t);
> +TEST(int64_t);
> +TEST(vboolchar_t);
> +TEST(vuint_t);
> +  
> +/* { dg-final { scan-assembler-times "fuse_and_and/0"16 } } */
> +/* { dg-final { scan-assembler-times "fuse_and_and/2"16 } } */
> +/* { dg-final { scan-assembler-times "fuse_and

Re: [PATCH,rs6000] Fusion patterns for logical-logical

2021-01-03 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Dec 10, 2020, at 8:41 PM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> This patch adds a new function to genfusion.pl to generate patterns for
> logical-logical fusion. They are enabled by default for power10 and can
> be disabled by -mno-power10-fusion-2logical or -mno-power10-fusion.
> 
> This patch builds on top of the load-cmpi patch posted earlier this week.
> 
> Bootstrap passed on ppc64le/power10, if regtests pass, ok for trunk?
> 
> gcc/ChangeLog
>   * config/rs6000/genfusion.pl (gen_2logical): New function to
>   generate patterns for logical-logical fusion.
>   * config/rs6000/fusion.md: Regenerated patterns.
>   * config/rs6000/rs6000-cpus.def: Add
>   OPTION_MASK_P10_FUSION_2LOGICAL.
>   * config/rs6000/rs6000.c (rs6000_option_override_internal):
>   Enable logical-logical fusion for p10.
>   * config/rs6000/rs6000.opt: Add -mpower10-fusion-2logical.
> ---
> gcc/config/rs6000/fusion.md   | 2176 +
> gcc/config/rs6000/genfusion.pl|   89 ++
> gcc/config/rs6000/rs6000-cpus.def |4 +-
> gcc/config/rs6000/rs6000.c|3 +
> gcc/config/rs6000/rs6000.opt  |4 +
> 5 files changed, 2275 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
> index a4d3a6ae7f3..1ddbe7fe3d2 100644
> --- a/gcc/config/rs6000/fusion.md
> +++ b/gcc/config/rs6000/fusion.md
> @@ -355,3 +355,2179 @@ (define_insn_and_split 
> "*lbz_cmpldi_cr0_QI_GPR_CCUNS_zero"
>(set_attr "cost" "8")
>(set_attr "length" "8")])
> 
> +
> +;; logical-logical fusion pattern generated by gen_2logical
> +;; kind: scalar outer: and op and rtl and inv 0 comp 0
> +;; inner: and op and rtl and inv 0 comp 0
> +(define_insn "*fuse_and_and"
> +  [(set (match_operand:GPR 3 "gpc_reg_operand" "=,0,1,r")
> +(and:GPR (and:GPR (match_operand:GPR 0 "gpc_reg_operand" "r,r,r,r") 
> (match_operand:GPR 1 "gpc_reg_operand" "%r,r,r,r")) (match_operand:GPR 2 
> "gpc_reg_operand" "r,r,r,r")))
> +   (clobber (match_scratch:GPR 4 "=X,X,X,r"))]
> +  "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)"
> +  "@
> +   and %3,%1,%0\;and %3,%3,%2
> +   and %0,%1,%0\;and %0,%0,%2
> +   and %1,%1,%0\;and %1,%1,%2
> +   and %4,%1,%0\;and %3,%4,%2"
> +  [(set_attr "type" "logical")
> +   (set_attr "cost" "6")
> +   (set_attr "length" "8")])
> +
> +;; logical-logical fusion pattern generated by gen_2logical
> +;; kind: scalar outer: and op and rtl and inv 0 comp 0
> +;; inner: andc op andc rtl and inv 0 comp 1
> +(define_insn "*fuse_andc_and"
> +  [(set (match_operand:GPR 3 "gpc_reg_operand" "=,0,1,r")
> +(and:GPR (and:GPR (not:GPR (match_operand:GPR 0 "gpc_reg_operand" 
> "r,r,r,r")) (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r")) 
> (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
> +   (clobber (match_scratch:GPR 4 "=X,X,X,r"))]
> +  "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)"
> +  "@
> +   andc %3,%1,%0\;and %3,%3,%2
> +   andc %0,%1,%0\;and %0,%0,%2
> +   andc %1,%1,%0\;and %1,%1,%2
> +   andc %4,%1,%0\;and %3,%4,%2"
> +  [(set_attr "type" "logical")
> +   (set_attr "cost" "6")
> +   (set_attr "length" "8")])
> +
> +;; logical-logical fusion pattern generated by gen_2logical
> +;; kind: scalar outer: and op and rtl and inv 0 comp 0
> +;; inner: eqv op eqv rtl xor inv 1 comp 0
> +(define_insn "*fuse_eqv_and"
> +  [(set (match_operand:GPR 3 "gpc_reg_operand" "=,0,1,r")
> +(and:GPR (not:GPR (xor:GPR (match_operand:GPR 0 "gpc_reg_operand" 
> "r,r,r,r") (match_operand:GPR 1 "gpc_reg_operand" "r,r,r,r"))) 
> (match_operand:GPR 2 "gpc_reg_operand" "r,r,r,r")))
> +   (clobber (match_scratch:GPR 4 "=X,X,X,r"))]
> +  "(TARGET_P10_FUSION && TARGET_P10_FUSION_2LOGICAL)"
> +  "@
> +   eqv %3,%1,%0\;and %3,%3,%2
> +   eqv %0,%1,%0\;and %0,%0,%2
> +   eqv %1,%1,%0\;and %1,%1,%2
> +   eqv %4,%1,%0\;and %3,%4,%2"
> +  [(set_attr "type" "logical")
> +   (set_attr "cost" "6")
> +   (set_attr "length" "8")])
> +
> +;; logical-logi

Re: [PATCH,rs6000] Combine patterns for p10 load-cmpi fusion

2021-01-03 Thread Aaron Sawdey via Gcc-patches
Ping.

I assume we’re going to want a separate patch for the new instruction type.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Dec 4, 2020, at 1:19 PM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> This patch adds the first batch of patterns to support p10 fusion. These
> will allow combine to create a single insn for a pair of instructions
> that that power10 can fuse and execute. These particular ones have the
> requirement that only cr0 can be used when fusing a load with a compare
> immediate of -1/0/1 (if signed) or 0/1 (if unsigned), so we want combine
> to put that requirement in, and if it doesn't work out later the splitter
> can get used.
> 
> The patterns are generated by a script genfusion.pl and live in new file
> fusion.md. This script will be expanded to generate more patterns for
> fusion.
> 
> This also adds option -mpower10-fusion which defaults on for power10 and
> will gate all these fusion patterns. In addition I have added an
> undocumented option -mpower10-fusion-ld-cmpi (which may be removed later)
> that just controls the load+compare-immediate patterns. I have make
> these default on for power10 but they are not disallowed for earlier
> processors because it is still valid code. This allows us to test the
> correctness of fusion code generation by turning it on explicitly.
> 
> If bootstrap/regtest is clean, ok for trunk?
> 
> Thanks!
> 
>   Aaron
> 
> gcc/ChangeLog:
> 
>   * config/rs6000/genfusion.pl: New file, script to generate
>   define_insn_and_split patterns so combine can arrange fused
>   instructions next to each other.
>   * config/rs6000/fusion.md: New file, generated fused instruction
>   patterns for combine.
>   * config/rs6000/predicates.md (const_m1_to_1_operand): New predicate.
>   (non_update_memory_operand): New predicate.
>   * config/rs6000/rs6000-cpus.def: Add OPTION_MASK_P10_FUSION and
>   OPTION_MASK_P10_FUSION_LD_CMPI to ISA_3_1_MASKS_SERVER and
>   POWERPC_MASKS.
>   * config/rs6000/rs6000-protos.h (address_is_non_pfx_d_or_x): Add
>   prototype.
>   * config/rs6000/rs6000.c (rs6000_option_override_internal):
>   automatically set -mpower10-fusion and -mpower10-fusion-ld-cmpi
>   if target is power10.  (rs600_opt_masks): Allow -mpower10-fusion
>   in function attributes.  (address_is_non_pfx_d_or_x): New function.
>   * config/rs6000/rs6000.h: Add MASK_P10_FUSION.
>   * config/rs6000/rs6000.md: Include fusion.md.
>   * config/rs6000/rs6000.opt: Add -mpower10-fusion
>   and -mpower10-fusion-ld-cmpi.
>   * config/rs6000/t-rs6000: Add dependencies involving fusion.md.
> ---
> gcc/config/rs6000/fusion.md   | 357 ++
> gcc/config/rs6000/genfusion.pl| 144 
> gcc/config/rs6000/predicates.md   |  14 ++
> gcc/config/rs6000/rs6000-cpus.def |   6 +-
> gcc/config/rs6000/rs6000-protos.h |   2 +
> gcc/config/rs6000/rs6000.c|  51 +
> gcc/config/rs6000/rs6000.h|   1 +
> gcc/config/rs6000/rs6000.md   |   1 +
> gcc/config/rs6000/rs6000.opt  |   8 +
> gcc/config/rs6000/t-rs6000|   6 +-
> 10 files changed, 588 insertions(+), 2 deletions(-)
> create mode 100644 gcc/config/rs6000/fusion.md
> create mode 100755 gcc/config/rs6000/genfusion.pl
> 
> diff --git a/gcc/config/rs6000/fusion.md b/gcc/config/rs6000/fusion.md
> new file mode 100644
> index 000..a4d3a6ae7f3
> --- /dev/null
> +++ b/gcc/config/rs6000/fusion.md
> @@ -0,0 +1,357 @@
> +;; -*- buffer-read-only: t -*-
> +;; Generated automatically by genfusion.pl
> +
> +;; Copyright (C) 2020 Free Software Foundation, Inc.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it under
> +;; the terms of the GNU General Public License as published by the Free
> +;; Software Foundation; either version 3, or (at your option) any later
> +;; version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +;; for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +;; load-cmpi fusion pattern generated by gen_ld_cmpi_p10
> +;; load mode is DI result mode is clobber compare mode is CC extend is none
> +(define_insn_and_split "*ld_cmpdi_cr0_DI_clobber_CC_none"
> +  [(set (match_operand:CC 2 "cc_reg_operand" "=x")
> 

Re: [PATCH] Additional small changes to support opaque modes

2020-11-20 Thread Aaron Sawdey via Gcc-patches
> On Nov 20, 2020, at 4:57 AM, Aaron Sawdey via Gcc-patches 
>  wrote:
> 
> 
>> On Nov 20, 2020, at 3:55 AM, Richard Sandiford  
>> wrote:
>> 
>> acsawdey--- via Gcc-patches  writes:
>>> @@ -16767,7 +16768,7 @@ loc_descriptor (rtx rtl, machine_mode mode,
>>>  break;
>>> 
>>>case CONST_INT:
>>> -  if (mode != VOIDmode && mode != BLKmode)
>>> +  if (mode != VOIDmode && mode != BLKmode && !OPAQUE_MODE_P (mode))
>>> {
>>>   int_mode = as_a  (mode);
>>>   loc_result = address_of_int_loc_descriptor (GET_MODE_SIZE (int_mode),
>> 
>> I realise I'm asking this about something that already appears to handle
>> BLKmode CONST_INTs (?!), but this is the one change in the patch I
>> struggled with.  Why do we see a CONST_INT that allegedly has an
>> opaque mode?  It feels like something has gone wrong further up the
>> call chain.
>> 
>> This might still be the expedient fix for whatever is happening,
>> but I think it deserves a comment at least.
>> 
>> The rest looks good to me FWIW.
>> 
>> Richard
> 
> I should look at this again — since I originally put that in, I switched the 
> target
> portion of what I’ve been doing to use an UNSPEC to remove all use of an
> opaque mode const_int from the rtf. This may not be needed any more. 

And as a final addendum — I was able to remove this and the problem I saw
before did not come back, probably because UNSPEC is used to hide all
constants so we never see any opaque type or mode constants, which is a
good thing.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 




Re: [PATCH] Additional small changes to support opaque modes

2020-11-20 Thread Aaron Sawdey via Gcc-patches


> On Nov 20, 2020, at 3:55 AM, Richard Sandiford  
> wrote:
> 
> acsawdey--- via Gcc-patches  writes:
>> diff --git a/gcc/c/c-aux-info.c b/gcc/c/c-aux-info.c
>> index ffc8099856d..41f5598de38 100644
>> --- a/gcc/c/c-aux-info.c
>> +++ b/gcc/c/c-aux-info.c
>> @@ -413,6 +413,10 @@ gen_type (const char *ret_val, tree t, formals_style 
>> style)
>>data_type = IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (t)));
>>break;
>> 
>> +case OPAQUE_TYPE:
>> +  data_type = IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (t)));
>> +  break;
>> +
> 
> Might as well just add this case to the REAL_TYPE one.
> 
>>  case VOID_TYPE:
>>data_type = "void";
>>break;
>> […]
>> diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c
>> index 54eb445665c..d6d12efff34 100644
>> --- a/gcc/dwarf2out.c
>> +++ b/gcc/dwarf2out.c
>> @@ -13037,6 +13037,7 @@ is_base_type (tree type)
>>   return 1;
>> 
>> case VOID_TYPE:
>> +case OPAQUE_TYPE:
>> case ARRAY_TYPE:
>> case RECORD_TYPE:
>> case UNION_TYPE:
>> @@ -16767,7 +16768,7 @@ loc_descriptor (rtx rtl, machine_mode mode,
>>   break;
>> 
>> case CONST_INT:
>> -  if (mode != VOIDmode && mode != BLKmode)
>> +  if (mode != VOIDmode && mode != BLKmode && !OPAQUE_MODE_P (mode))
>>  {
>>int_mode = as_a  (mode);
>>loc_result = address_of_int_loc_descriptor (GET_MODE_SIZE (int_mode),
> 
> I realise I'm asking this about something that already appears to handle
> BLKmode CONST_INTs (?!), but this is the one change in the patch I
> struggled with.  Why do we see a CONST_INT that allegedly has an
> opaque mode?  It feels like something has gone wrong further up the
> call chain.
> 
> This might still be the expedient fix for whatever is happening,
> but I think it deserves a comment at least.
> 
> The rest looks good to me FWIW.
> 
> Richard

I should look at this again — since I originally put that in, I switched the 
target
portion of what I’ve been doing to use an UNSPEC to remove all use of an
opaque mode const_int from the rtf. This may not be needed any more. 

Thanks,
   Aaron

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 




[PATCH,rs6000] Make MMA builtins use opaque modes [v2]

2020-11-19 Thread Aaron Sawdey via Gcc-patches
For some reason this patch never showed up on gcc-patches.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> Begin forwarded message:
> 
> From: acsaw...@linux.ibm.com
> Subject: [PATCH,rs6000] Make MMA builtins use opaque modes [v2]
> Date: November 19, 2020 at 12:58:47 PM CST
> To: gcc-patches@gcc.gnu.org
> Cc: seg...@kernel.crashing.org, wschm...@linux.ibm.com, 
> berg...@linux.ibm.com, Aaron Sawdey 
> 
> From: Aaron Sawdey 
> 
> Segher & Bergner -
>  Thanks for the reviews, here's the updated patch after fixing those things.
> We now have an UNSPEC for xxsetaccz, and an accompanying change to
> rs6000_rtx_costs to make it be cost 0 so that CSE doesn't try to replace it
> with a bunch of register moves.
> 
> If bootstrap/regtest looks good, ok for trunk?
> 
> Thanks,
>Aaron
> 
> gcc/
>   * gcc/config/rs6000/mma.md (unspec): Add assemble/extract UNSPECs.
>   (movoi): Change to movoo.
>   (*movpoi): Change to *movoo.
>   (movxi): Change to movxo.
>   (*movpxi): Change to *movxo.
>   (mma_assemble_pair): Change to OO mode.
>   (*mma_assemble_pair): New define_insn_and_split.
>   (mma_disassemble_pair): New define_expand.
>   (*mma_disassemble_pair): New define_insn_and_split.
>   (mma_assemble_acc): Change to XO mode.
>   (*mma_assemble_acc): Change to XO mode.
>   (mma_disassemble_acc): New define_expand.
>   (*mma_disassemble_acc): New define_insn_and_split.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to OO mode.
>   (mma_): Change to XO/OO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO/OO mode.
>   (mma_): Change to XO/OO mode.
>   (mma_): Change to XO mode.
>   (mma_): Change to XO mode.
>   * gcc/config/rs6000/predicates.md (input_operand): Allow opaque.
>   (mma_disassemble_output_operand): New predicate.
>   * gcc/config/rs6000/rs6000-builtin.def:
>   Changes to disassemble builtins.
>   * gcc/config/rs6000/rs6000-call.c (rs6000_return_in_memory):
>   Disallow __vector_pair/__vector_quad as return types.
>   (rs6000_promote_function_mode): Remove function return type
>   check because we can't test it here any more.
>   (rs6000_function_arg): Do not allow __vector_pair/__vector_quad
>   as as function arguments.
>   (rs6000_gimple_fold_mma_builtin):
>   Handle mma_disassemble_* builtins.
>   (rs6000_init_builtins): Create types for XO/OO modes.
>   * gcc/config/rs6000/rs6000-modes.def: DElete OI, XI,
>   POI, and PXI modes, and create XO and OO modes.
>   * gcc/config/rs6000/rs6000-string.c (expand_block_move):
>   Update to OO mode.
>   * gcc/config/rs6000/rs6000.c (rs6000_hard_regno_mode_ok_uncached):
>   Update for XO/OO modes.
>   (rs6000_rtx_costs): Make UNSPEC_MMA_XXSETACCZ cost 0.
>   (rs6000_modes_tieable_p): Update for XO/OO modes.
>   (rs6000_debug_reg_global): Update for XO/OO modes.
>   (rs6000_setup_reg_addr_masks): Update for XO/OO modes.
>   (rs6000_init_hard_regno_mode_ok): Update for XO/OO modes.
>   (reg_offset_addressing_ok_p): Update for XO/OO modes.
>   (rs6000_emit_move): Update for XO/OO modes.
>   (rs6000_preferred_reload_class): Update for XO/OO modes.
>   (rs6000_split_multireg_move): Update for XO/OO modes.
>   (rs6000_mangle_type): Update for opaque types.
>   (rs6000_invalid_conversion): Update for XO/OO modes.
>   * gcc/config/rs6000/rs6000.h (VECTOR_ALIGNMENT_P):
>   Update for XO/OO modes.
>   * gcc/config/rs6000/rs6000.md (RELOAD): Update for XO/OO modes.
> gcc/testsuite/
>   * gcc.target/powerpc/mma-double-test.c (main): Call abort for failure.
>   * gcc.target/powerpc/mma-single-test.c (main): Call abort for failure.
>   * gcc.target/powerpc/pr96506.c: Rename to pr96506-1.c.
>   * gcc.target/powerpc/pr96506-2.c: New test.
> ---
> gcc/config/rs6000/mma.md  | 421 ++
> gcc/config/rs6000/predicates.md   |  12 +
> gcc/config/rs6000/rs6000-builtin.def  |  14 +-
> gcc/config/rs6000/rs6000-call.c   | 142 +++---
> gcc/config/rs6000/rs6000-modes.def|  10 +-
> gcc/config/rs6000/rs6000-string.c |   6 +-
> gcc/config/rs6000/rs6000.c| 193 
> gcc/config/rs6000/rs6000.h|   3 +-
> gcc/config/rs6000/rs6000.md   |

Re: [PATCH,rs6000] Add patterns for combine to support p10 fusion

2020-11-04 Thread Aaron Sawdey via Gcc-patches
Ping.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Oct 26, 2020, at 4:44 PM, acsaw...@linux.ibm.com wrote:
> 
> From: Aaron Sawdey 
> 
> This patch adds the first couple patterns to support p10 fusion. These
> will allow combine to create a single insn for a pair of instructions
> that that power10 can fuse and execute. These particular ones have the
> requirement that only cr0 can be used when fusing a load with a compare
> immediate of -1/0/1, so we want combine to put that requirement in, and
> if it doesn't work out later the splitter can get used.
> 
> This also adds option -mpower10-fusion which defaults on for power10 and
> will gate all these fusion patterns. In addition I have added an
> undocumented option -mpower10-fusion-ld-cmpi (which may be removed later)
> that just controls the load+compare-immediate patterns. I have make
> these default on for power10 but they are not disallowed for earlier
> processors because it is still valid code. This allows us to test the
> correctness of fusion code generation by turning it on explicitly.
> 
> The intention is to work through more patterns of this style to support
> the rest of the power10 fusion pairs.
> 
> Bootstrap and regtest looks good on ppc64le power9 with these patterns
> enabled in stage2/stage3 and for regtest. Ok for trunk?
> 
> gcc/ChangeLog:
> 
>   * config/rs6000/predicates.md: Add const_me_to_1_operand.
>   * config/rs6000/rs6000-cpus.def: Add OPTION_MASK_P10_FUSION and
>   OPTION_MASK_P10_FUSION_LD_CMPI to ISA_3_1_MASKS_SERVER.
>   * config/rs6000/rs6000-protos.h (address_ok_for_form): Add
>   prototype.
>   * config/rs6000/rs6000.c (rs6000_option_override_internal):
>   automatically set -mpower10-fusion and -mpower10-fusion-ld-cmpi
>   if target is power10.  (rs600_opt_masks): Allow -mpower10-fusion
>   in function attributes.  (address_ok_for_form): New function.
>   * config/rs6000/rs6000.h: Add MASK_P10_FUSION.
>   * config/rs6000/rs6000.md (*ld_cmpi_cr0): New
>   define_insn_and_split.
>   (*lwa_cmpdi_cr0): New define_insn_and_split.
>   (*lwa_cmpwi_cr0): New define_insn_and_split.
>   * config/rs6000/rs6000.opt: Add -mpower10-fusion
>   and -mpower10-fusion-ld-cmpi.
> ---
> gcc/config/rs6000/predicates.md   |  5 +++
> gcc/config/rs6000/rs6000-cpus.def |  6 ++-
> gcc/config/rs6000/rs6000-protos.h |  2 +
> gcc/config/rs6000/rs6000.c| 34 
> gcc/config/rs6000/rs6000.h|  1 +
> gcc/config/rs6000/rs6000.md   | 68 +++
> gcc/config/rs6000/rs6000.opt  |  8 
> 7 files changed, 123 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
> index 4c2fe7fa312..b75c1ddfb69 100644
> --- a/gcc/config/rs6000/predicates.md
> +++ b/gcc/config/rs6000/predicates.md
> @@ -297,6 +297,11 @@ (define_predicate "const_0_to_1_operand"
>   (and (match_code "const_int")
>(match_test "IN_RANGE (INTVAL (op), 0, 1)")))
> 
> +;; Match op = -1, op = 0, or op = 1.
> +(define_predicate "const_m1_to_1_operand"
> +  (and (match_code "const_int")
> +   (match_test "IN_RANGE (INTVAL (op), -1, 1)")))
> +
> ;; Match op = 0..3.
> (define_predicate "const_0_to_3_operand"
>   (and (match_code "const_int")
> diff --git a/gcc/config/rs6000/rs6000-cpus.def 
> b/gcc/config/rs6000/rs6000-cpus.def
> index 8d2c1ffd6cf..3e65289d8df 100644
> --- a/gcc/config/rs6000/rs6000-cpus.def
> +++ b/gcc/config/rs6000/rs6000-cpus.def
> @@ -82,7 +82,9 @@
> 
> #define ISA_3_1_MASKS_SERVER  (ISA_3_0_MASKS_SERVER   \
>| OPTION_MASK_POWER10  \
> -  | OTHER_POWER10_MASKS)
> +  | OTHER_POWER10_MASKS  \
> +  | OPTION_MASK_P10_FUSION   \
> +  | OPTION_MASK_P10_FUSION_LD_CMPI)
> 
> /* Flags that need to be turned off if -mno-power9-vector.  */
> #define OTHER_P9_VECTOR_MASKS (OPTION_MASK_FLOAT128_HW\
> @@ -129,6 +131,8 @@
>| OPTION_MASK_FLOAT128_KEYWORD \
>| OPTION_MASK_FPRND\
>| OPTION_MASK_POWER10  \
> +  | OPTION_MASK_P10_FUSION   \
> +  | OPTION_MASK_P10_FUSION_LD_CMPI   \
>| OPTION_MASK_HTM  \
>  

Re: [PATCH] [PATCH] PR rtl-optimization/96791 Check precision of partial modes

2020-11-02 Thread Aaron Sawdey via Gcc-patches
Ping.

So, this has sat for a while and it’s getting close to the end of stage1 now. I 
don’t see that we're any closer to a solution that allows us to use POImode 
without risking this ICE. I had to disable the use of VSX vector pair 
loads/stores in inline expansion of memcpy/memmove do avoid it. There is no 
solution like that for the MMA builtins that use POImode and are (in theory) 
exposed to the same problem.

So I ask again, how can we tell extract_low_bits() that POImode is off limits 
to its prying fingers?

Thanks,
   Aaron


Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Sep 14, 2020, at 10:47 AM, Segher Boessenkool  
> wrote:
> 
> On Mon, Sep 14, 2020 at 09:46:11AM +0200, Richard Biener wrote:
>> On Fri, Sep 11, 2020 at 4:18 PM Segher Boessenkool
>>  wrote:
>>> Until 2014 (and documented just days ago ;-) ) all bits of a partial
>>> integer mode were considered unknown.
>> 
>> All bits or all bits outside of its precision?  I hope the latter ;)
> 
> All bits.  Many things in GCC still follow that older definition, btw.
> 
>>> I have looked at a lot of it in
>>> our code the past weeks, and we still treat it like that in most places.
> 
> Oh I said that already, heh.
> 
>>> We now see bootstrap problems if we use POImode in some contexts (that's
>>> this PR96791).  POImode can only live in pairs of VSX registers; taking
>>> a subreg of POImode that would not be valid on one VSX register is not
>>> okay.
>> 
>> I guess the same applies to i?86 DImode living in two gpr regs.  Or any
>> multi-reg pseudo.  It certainly shouldn't be dependent on whether we're
>> dealing with a partial integer mode or not.
> 
> If some mode can be in GPRs, then taking subregs of it works fine.
> 
>>> Maybe we are missing some hooks or macros?
>> 
>> So this problem must be "solved" in some way already.  How do we asses
>> subreg validity?  Through recog in the end?
> 
> No, we ICE.  See the PR?  (PR96791).
> 
> 
> Segher



Ping: [PATCH] PR rtl-optimization/96791 Check precision of partial modes

2020-10-05 Thread Aaron Sawdey via Gcc-patches
Not exactly a patch ping, but I was hoping we could re-engage the discussion on 
this and figure out how we can make POImode work for powerpc.

How does x86 solve this? There was some suggestion that it has some similar 
situations? 

Thanks,
   

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Sep 9, 2020, at 1:27 PM, Aaron Sawdey  wrote:
> 
> Now that the documentation for partial modes says they have a known
> number of bits of precision, would it make sense for extract_low_bits to
> check this before attempting to extract the bits?
> 
> This would solve the problem we have been having with POImode and
> extract_low_bits -- DSE tries to use it to extract part of a POImode
> register used in a previous store. We do not want to supply any patterns
> to make POImode (or OImode) used like a regular integer mode.
> 
> This patch adds such a check, and sets the precision of POImode to one
> bit, which resolves the problems of PR/96791 for ppc64 target.
> 
> Bootstrap passes on ppc64le and x86_64.
> 
> Thanks,
>   Aaron
> 
> gcc/ChangeLog:
> 
>   * config/rs6000/rs6000-modes.def (POImode): Change precision.
>   * expmed.c (extract_low_bits): Check precision.
> ---
> gcc/config/rs6000/rs6000-modes.def | 2 +-
> gcc/expmed.c   | 3 +++
> 2 files changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/rs6000/rs6000-modes.def 
> b/gcc/config/rs6000/rs6000-modes.def
> index ddb218b3fba..aa7d60dd835 100644
> --- a/gcc/config/rs6000/rs6000-modes.def
> +++ b/gcc/config/rs6000/rs6000-modes.def
> @@ -90,5 +90,5 @@ INT_MODE (OI, 32);
> INT_MODE (XI, 64);
> 
> /* Modes used by __vector_pair and __vector_quad.  */
> -PARTIAL_INT_MODE (OI, 256, POI); /* __vector_pair.  */
> +PARTIAL_INT_MODE (OI, 1, POI);   /* __vector_pair.  */
> PARTIAL_INT_MODE (XI, 512, PXI);  /* __vector_quad.  */
> diff --git a/gcc/expmed.c b/gcc/expmed.c
> index d34f0fb0b54..23ca181afa6 100644
> --- a/gcc/expmed.c
> +++ b/gcc/expmed.c
> @@ -2396,6 +2396,9 @@ extract_low_bits (machine_mode mode, machine_mode 
> src_mode, rtx src)
>   if (GET_MODE_CLASS (mode) == MODE_CC || GET_MODE_CLASS (src_mode) == 
> MODE_CC)
> return NULL_RTX;
> 
> +  if (known_lt (GET_MODE_PRECISION (src_mode), GET_MODE_BITSIZE (mode)))
> +return NULL_RTX;
> +
>   if (known_eq (GET_MODE_BITSIZE (mode), GET_MODE_BITSIZE (src_mode))
>   && targetm.modes_tieable_p (mode, src_mode))
> {
> -- 
> 2.17.1
> 



[PATCH][PR96791] disable POImode ld/st for memcpy [committed]

2020-09-10 Thread Aaron Sawdey via Gcc-patches
This is a (hopefully temporary) fix to PR96791. This will make
the default be -mno-block-ops-vector-pair even on power10, so we will
not hit the issue of DSE trying to truncate a POImode register. I am
still concerned it will be possible to hit this because the MMA builtins
will also generate POImode stores, but I think any example of that will
be somewhat more contrived.

Bootstrap and regression tests passed on ppc64le power9.
Pre-approved by Segher, posting after commit.

gcc/ChangeLog:

* config/rs6000/rs6000.c (rs6000_option_override_internal):
Change default.
---
 gcc/config/rs6000/rs6000.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index f6a3ff6f089..9908830b07a 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4020,10 +4020,11 @@ rs6000_option_override_internal (bool global_init_p)
 
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
 {
-  if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
-   rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
-  else
-   rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+  /* When the POImode issues of PR96791 are resolved, then we can
+once again enable use of vector pair for memcpy/memmove on
+P10 if we have TARGET_MMA.  For now we make it disabled by
+default for all targets.  */
+  rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
 }
 
   /* Use long double size to select the appropriate long double.  We use
-- 
2.17.1



Re: [PATCH] [PATCH] PR rtl-optimization/96791 Check precision of partial modes

2020-09-10 Thread Aaron Sawdey via Gcc-patches
So, would it be legitimate for extract_low_bits to query if the truncate 
pattern it will likely use is actually available? 

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Sep 10, 2020, at 10:10 AM, Segher Boessenkool  
> wrote:
> 
> Hi!
> 
> On Thu, Sep 10, 2020 at 04:33:30PM +0200, Richard Biener wrote:
>> On Thu, Sep 10, 2020 at 4:22 PM Aaron Sawdey  wrote:
>>> If it feels like a hack, that would because it is a hack.
>>> 
>>> What I’d really like to discuss is how to accomplish the real goal: keep 
>>> anything from trying to do other operations (zero/sign extend for one) to 
>>> POImode.
>>> 
>>> Is there an existing mechanism for this?
>> 
>> Not that I know, but somehow x86 gets away with OImode and XImode without
>> providing too many patterns for those.
> 
> What we were seeing is DSE (of all things!) tries to extract a DImode
> from a POImode (and expects that insn to exist!)  That is no good.
> 
> 
> Segher



Re: [PATCH] [PATCH] PR rtl-optimization/96791 Check precision of partial modes

2020-09-10 Thread Aaron Sawdey via Gcc-patches
If it feels like a hack, that would because it is a hack.

What I’d really like to discuss is how to accomplish the real goal: keep 
anything from trying to do other operations (zero/sign extend for one) to 
POImode.

Is there an existing mechanism for this?

Thanks,
Aaron

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Sep 10, 2020, at 4:36 AM, Richard Biener  
> wrote:
> 
> On Wed, Sep 9, 2020 at 8:28 PM Aaron Sawdey via Gcc-patches
>  wrote:
>> 
>> Now that the documentation for partial modes says they have a known
>> number of bits of precision, would it make sense for extract_low_bits to
>> check this before attempting to extract the bits?
>> 
>> This would solve the problem we have been having with POImode and
>> extract_low_bits -- DSE tries to use it to extract part of a POImode
>> register used in a previous store. We do not want to supply any patterns
>> to make POImode (or OImode) used like a regular integer mode.
>> 
>> This patch adds such a check, and sets the precision of POImode to one
>> bit, which resolves the problems of PR/96791 for ppc64 target.
> 
> How many bits are you actually storing in POImode?  If you say it's
> precision is 1 then the middle-end might be tempted to ignore any
> changes to the upper bits.  You now probably say "but we don't have
> any such interesting operation done on POImode" but still ... it feels
> like a hack.
> 
> Richard.
> 
>> Bootstrap passes on ppc64le and x86_64.
>> 
>> Thanks,
>>   Aaron
>> 
>> gcc/ChangeLog:
>> 
>>* config/rs6000/rs6000-modes.def (POImode): Change precision.
>>* expmed.c (extract_low_bits): Check precision.
>> ---
>> gcc/config/rs6000/rs6000-modes.def | 2 +-
>> gcc/expmed.c   | 3 +++
>> 2 files changed, 4 insertions(+), 1 deletion(-)
>> 
>> diff --git a/gcc/config/rs6000/rs6000-modes.def 
>> b/gcc/config/rs6000/rs6000-modes.def
>> index ddb218b3fba..aa7d60dd835 100644
>> --- a/gcc/config/rs6000/rs6000-modes.def
>> +++ b/gcc/config/rs6000/rs6000-modes.def
>> @@ -90,5 +90,5 @@ INT_MODE (OI, 32);
>> INT_MODE (XI, 64);
>> 
>> /* Modes used by __vector_pair and __vector_quad.  */
>> -PARTIAL_INT_MODE (OI, 256, POI);   /* __vector_pair.  */
>> +PARTIAL_INT_MODE (OI, 1, POI); /* __vector_pair.  */
>> PARTIAL_INT_MODE (XI, 512, PXI);   /* __vector_quad.  */
>> diff --git a/gcc/expmed.c b/gcc/expmed.c
>> index d34f0fb0b54..23ca181afa6 100644
>> --- a/gcc/expmed.c
>> +++ b/gcc/expmed.c
>> @@ -2396,6 +2396,9 @@ extract_low_bits (machine_mode mode, machine_mode 
>> src_mode, rtx src)
>>   if (GET_MODE_CLASS (mode) == MODE_CC || GET_MODE_CLASS (src_mode) == 
>> MODE_CC)
>> return NULL_RTX;
>> 
>> +  if (known_lt (GET_MODE_PRECISION (src_mode), GET_MODE_BITSIZE (mode)))
>> +return NULL_RTX;
>> +
>>   if (known_eq (GET_MODE_BITSIZE (mode), GET_MODE_BITSIZE (src_mode))
>>   && targetm.modes_tieable_p (mode, src_mode))
>> {
>> --
>> 2.17.1
>> 



[PATCH] [PATCH] PR rtl-optimization/96791 Check precision of partial modes

2020-09-09 Thread Aaron Sawdey via Gcc-patches
Now that the documentation for partial modes says they have a known
number of bits of precision, would it make sense for extract_low_bits to
check this before attempting to extract the bits?

This would solve the problem we have been having with POImode and
extract_low_bits -- DSE tries to use it to extract part of a POImode
register used in a previous store. We do not want to supply any patterns
to make POImode (or OImode) used like a regular integer mode.

This patch adds such a check, and sets the precision of POImode to one
bit, which resolves the problems of PR/96791 for ppc64 target.

Bootstrap passes on ppc64le and x86_64.

Thanks,
   Aaron

gcc/ChangeLog:

* config/rs6000/rs6000-modes.def (POImode): Change precision.
* expmed.c (extract_low_bits): Check precision.
---
 gcc/config/rs6000/rs6000-modes.def | 2 +-
 gcc/expmed.c   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/rs6000-modes.def 
b/gcc/config/rs6000/rs6000-modes.def
index ddb218b3fba..aa7d60dd835 100644
--- a/gcc/config/rs6000/rs6000-modes.def
+++ b/gcc/config/rs6000/rs6000-modes.def
@@ -90,5 +90,5 @@ INT_MODE (OI, 32);
 INT_MODE (XI, 64);
 
 /* Modes used by __vector_pair and __vector_quad.  */
-PARTIAL_INT_MODE (OI, 256, POI);   /* __vector_pair.  */
+PARTIAL_INT_MODE (OI, 1, POI); /* __vector_pair.  */
 PARTIAL_INT_MODE (XI, 512, PXI);   /* __vector_quad.  */
diff --git a/gcc/expmed.c b/gcc/expmed.c
index d34f0fb0b54..23ca181afa6 100644
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -2396,6 +2396,9 @@ extract_low_bits (machine_mode mode, machine_mode 
src_mode, rtx src)
   if (GET_MODE_CLASS (mode) == MODE_CC || GET_MODE_CLASS (src_mode) == MODE_CC)
 return NULL_RTX;
 
+  if (known_lt (GET_MODE_PRECISION (src_mode), GET_MODE_BITSIZE (mode)))
+return NULL_RTX;
+
   if (known_eq (GET_MODE_BITSIZE (mode), GET_MODE_BITSIZE (src_mode))
   && targetm.modes_tieable_p (mode, src_mode))
 {
-- 
2.17.1



[committed] rs6000: unaligned VSX in memcpy/memmove expansion

2020-08-18 Thread Aaron Sawdey via Gcc-patches
I've modified slightly per Will & Segher's comments, re-regstrapped and
posting what I've actually committed.

  Aaron

This patch adds a few new instructions to inline expansion of
memcpy/memmove. Generation of all these are controlled by
the option -mblock-ops-unaligned-vsx which is set on by default if the
target has TARGET_EFFICIENT_UNALIGNED_VSX.
 * unaligned vsx load/store (V2DImode)
 * unaligned vsx pair load/store (POImode) which is also controlled
   by -mblock-ops-vector-pair in case it is not wanted at some point.
   The default for -mblock-ops-vector-pair is for it to be on if the
   target has TARGET_MMA and TARGET_EFFICIENT_UNALIGNED_VSX. This is
   redundant, but nice for the future to clearly specify what is
   required.
 * unaligned vsx lxvl/stxvl but generally only to do the remainder
   of a copy/move we stated with some vsx loads/stores, and also prefer
   to use lb/lh/lw/ld if the remainder is 1/2/4/8 bytes.

Testing of this is actually accomplished by gcc.dg/memcmp-1.c which does
two memcpy() for each memcmp(). If the memcpy() calls don't do the right
thing then the memcmp() will fail unexpectedly.

gcc/ChangeLog:

* config/rs6000/rs6000-string.c (gen_lxvl_stxvl_move):
Helper function.
(expand_block_move): Add lxvl/stxvl, vector pair, and
unaligned VSX.
* config/rs6000/rs6000.c (rs6000_option_override_internal):
Default value for -mblock-ops-vector-pair.
* config/rs6000/rs6000.opt: Add -mblock-ops-vector-pair.
---
 gcc/config/rs6000/rs6000-string.c | 103 ++
 gcc/config/rs6000/rs6000.c|  14 +++-
 gcc/config/rs6000/rs6000.opt  |   4 ++
 3 files changed, 105 insertions(+), 16 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-string.c 
b/gcc/config/rs6000/rs6000-string.c
index c35d93180ca..82cc24ecdda 100644
--- a/gcc/config/rs6000/rs6000-string.c
+++ b/gcc/config/rs6000/rs6000-string.c
@@ -2708,6 +2708,32 @@ gen_lvx_v4si_move (rtx dest, rtx src)
 return gen_altivec_lvx_v4si_internal (dest, src);
 }
 
+static rtx
+gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
+{
+  gcc_assert (MEM_P (dest) ^ MEM_P (src));
+  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
+  gcc_assert (length <= 16);
+
+  bool is_store = MEM_P (dest);
+  rtx addr;
+
+  /* If the address form is not a simple register, make it so.  */
+  if (is_store)
+addr = XEXP (dest, 0);
+  else
+addr = XEXP (src, 0);
+
+  if (!REG_P (addr))
+addr = force_reg (Pmode, addr);
+
+  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
+  if (is_store)
+return gen_stxvl (src, addr, len);
+  else
+return gen_lxvl (dest, addr, len);
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.
 
@@ -2750,18 +2776,56 @@ expand_block_move (rtx operands[], bool might_overlap)
   if (bytes > rs6000_block_move_inline_limit)
 return 0;
 
+  int orig_bytes = bytes;
   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
 {
   union {
-   rtx (*movmemsi) (rtx, rtx, rtx, rtx);
rtx (*mov) (rtx, rtx);
+   rtx (*movlen) (rtx, rtx, int);
   } gen_func;
   machine_mode mode = BLKmode;
   rtx src, dest;
-
-  /* Altivec first, since it will be faster than a string move
-when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+  bool move_with_length = false;
+
+  /* Use POImode for paired vsx load/store.  Use V2DI for single
+unaligned vsx load/store, for consistency with what other
+expansions (compare) already do, and so we can use lxvd2x on
+p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, VSX
+with length < 16 (if allowed), then gpr load/store.  */
+
+  if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_BLOCK_OPS_VECTOR_PAIR
+ && bytes >= 32
+ && (align >= 256 || !STRICT_ALIGNMENT))
+   {
+ move_bytes = 32;
+ mode = POImode;
+ gen_func.mov = gen_movpoi;
+   }
+  else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
+  && VECTOR_MEM_VSX_P (V2DImode)
+  && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
+   {
+ move_bytes = 16;
+ mode = V2DImode;
+ gen_func.mov = gen_vsx_movv2di_64bit;
+   }
+  else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
+  && TARGET_POWER10 && bytes < 16
+  && orig_bytes > 16
+  && !(bytes == 1 || bytes == 2
+   || bytes == 4 || bytes == 8)
+  && (align >= 128 || !STRICT_ALIGNMENT))
+   {
+ /* Only use lxvl/stxvl if it could replace multiple ordinary
+loads+stores.  Also don't use it unless we likely already
+did one vsx copy so we aren't mixing gpr and vsx.  */
+ move_bytes = bytes;
+

[PATCH] rs6000: unaligned VSX in memcpy/memmove expansion

2020-08-14 Thread Aaron Sawdey via Gcc-patches
This patch adds a few new instructions to inline expansion of
memcpy/memmove. Generation of all these is controlled by
the option -mblock-ops-unaligned-vsx which is set on by default if the
target has TARGET_EFFICIENT_UNALIGNED_VSX.
 * unaligned vsx load/store (V2DImode)
 * unaligned vsx pair load/store (POImode) which is also controlled
   by -mblock-ops-vector-pair in case it is not wanted at some point.
   The default for this option is also for it to be on if the target has
   TARGET_EFFICIENT_UNALIGNED_VSX.
 * unaligned vsx lxvl/stxvl but generally only to do the remainder
   of a copy/move we stated with some vsx loads/stores, and also prefer
   to use lb/lh/lw/ld if the remainder is 1/2/4/8 bytes.

Testing of this is actually accomplished by gcc.dg/memcmp-1.c which does
two memcpy() for each memcmp(). If the memcpy() calls don't do the right
thing then the memcmp() will fail unexpectedly.

Regstrap passed on ppc64le power9 and the memcmp-1.c test passes on
power10 simulator, ok for trunk?

Thanks!
Aaron

gcc/ChangeLog:

* config/rs6000/rs6000-string.c (gen_lxvl_stxvl_move):
Helper function.
(expand_block_move): Add lxvl/stxvl, vector pair, and
unaligned VSX.
* config/rs6000/rs6000.c (rs6000_option_override_internal):
Default value for -mblock-ops-vector-pair.
* config/rs6000/rs6000.opt: Add -mblock-ops-vector-pair.
---
 gcc/config/rs6000/rs6000-string.c | 105 ++
 gcc/config/rs6000/rs6000.c|  14 +++-
 gcc/config/rs6000/rs6000.opt  |   4 ++
 3 files changed, 107 insertions(+), 16 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-string.c 
b/gcc/config/rs6000/rs6000-string.c
index c35d93180ca..ce6db2ba14d 100644
--- a/gcc/config/rs6000/rs6000-string.c
+++ b/gcc/config/rs6000/rs6000-string.c
@@ -2708,6 +2708,36 @@ gen_lvx_v4si_move (rtx dest, rtx src)
 return gen_altivec_lvx_v4si_internal (dest, src);
 }
 
+static rtx
+gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
+{
+  gcc_assert (MEM_P (dest) ^ MEM_P (src));
+  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
+  gcc_assert (length <= 16);
+
+  bool is_store = MEM_P (dest);
+
+  /* If the address form is not a simple register, make it so.  */
+  if (is_store)
+{
+  dest = XEXP (dest, 0);
+  if (!REG_P (dest))
+   dest = force_reg (Pmode, dest);
+}
+  else
+{
+  src = XEXP (src, 0);
+  if (!REG_P (src))
+   src = force_reg (Pmode, src);
+}
+
+  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
+  if (is_store)
+return gen_stxvl (src, dest, len);
+  else
+return  gen_lxvl (dest, src, len);
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.
 
@@ -2750,18 +2780,57 @@ expand_block_move (rtx operands[], bool might_overlap)
   if (bytes > rs6000_block_move_inline_limit)
 return 0;
 
+  int orig_bytes = bytes;
   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
 {
   union {
-   rtx (*movmemsi) (rtx, rtx, rtx, rtx);
rtx (*mov) (rtx, rtx);
+   rtx (*movlen) (rtx, rtx, int);
   } gen_func;
   machine_mode mode = BLKmode;
   rtx src, dest;
-
-  /* Altivec first, since it will be faster than a string move
-when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+  bool move_with_length = false;
+
+  /* Use POImode for paired vsx load/store.  Use V2DI for single
+unaligned vsx load/store, for consistency with what other
+expansions (compare) already do, and so we can use lxvd2x on
+p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, vsx
+with length < 16 (if allowed), then smaller gpr
+load/store.  */
+
+  if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_BLOCK_OPS_VECTOR_PAIR
+ && bytes >= 32
+ && (align >= 256 || !STRICT_ALIGNMENT))
+   {
+ move_bytes = 32;
+ mode = POImode;
+ gen_func.mov = gen_movpoi;
+   }
+  else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
+  && VECTOR_MEM_VSX_P (V2DImode)
+  && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
+   {
+ move_bytes = 16;
+ mode = V2DImode;
+ gen_func.mov = gen_vsx_movv2di_64bit;
+   }
+  else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
+  && TARGET_POWER10 && bytes < 16
+  && orig_bytes > 16
+  && !(bytes == 1 || bytes == 2
+   || bytes == 4 || bytes == 8)
+  && (align >= 128 || !STRICT_ALIGNMENT))
+   {
+ /* Only use lxvl/stxvl if it could replace multiple ordinary
+loads+stores.  Also don't use it unless we likely already
+did one vsx copy so we aren't mixing gpr and vsx.  */
+ move_bytes = bytes;
+ 

[PATCH] rs6000: clean up testsuite power10_hw check

2020-07-13 Thread Aaron Sawdey via Gcc-patches
Because the check for power10_hw is not called
check_effective_target_power10_hw, it needs to be looked
for by is-effective-target-keyword. Also reorder things
in is-effective-target to put power10_hw with the other
ppc stuff.

These little fixes for power10 dejagnu support were pre-approved
for trunk and 10 by Segher. Posting before pushing.

  Aaron

gcc/testsuite/

* lib/target-supports.exp (is-effective-target):
Reorder to put powerpc stuff together.
(is-effective-target-keyword): Add power10_hw.
---
 gcc/testsuite/lib/target-supports.exp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 2e4c696fdd1..57eed3012b9 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -7851,6 +7851,7 @@ proc is-effective-target { arg } {
  "p8vector_hw"{ set selected [check_p8vector_hw_available] }
  "p9vector_hw"{ set selected [check_p9vector_hw_available] }
  "p9modulo_hw"{ set selected [check_p9modulo_hw_available] }
+ "power10_hw" { set selected [check_power10_hw_available] }
  "ppc_float128_sw" { set selected [check_ppc_float128_sw_available] }
  "ppc_float128_hw" { set selected [check_ppc_float128_hw_available] }
  "ppc_recip_hw"   { set selected [check_ppc_recip_hw_available] }
@@ -7861,7 +7862,6 @@ proc is-effective-target { arg } {
  "named_sections" { set selected [check_named_sections_available] }
  "gc_sections"{ set selected [check_gc_sections_available] }
  "cxa_atexit" { set selected [check_cxa_atexit_available] }
- "power10_hw" { set selected [check_power10_hw_available] }
  default  { error "unknown effective target keyword `$arg'" }
}
 }
@@ -7883,6 +7883,7 @@ proc is-effective-target-keyword { arg } {
  "p8vector_hw"{ return 1 }
  "p9vector_hw"{ return 1 }
  "p9modulo_hw"{ return 1 }
+ "power10_hw" { return 1 }
  "ppc_float128_sw" { return 1 }
  "ppc_float128_hw" { return 1 }
  "ppc_recip_hw"   { return 1 }
-- 
2.17.1



[PATCH] rs6000: add effective-target test ppc_mma_hw

2020-07-10 Thread Aaron Sawdey via Gcc-patches
Add a test for dejagnu to determine if execution of MMA instructions is
supported in the test environment. Add an execution test to make sure
that __builtin_cpu_supports("mma") is true if we can execute MMA
instructions.

OK for trunk and backport to 10?

Thanks!
   Aaron

gcc/testsuite/

* lib/target-supports.exp (check_ppc_mma_hw_available):
New function.
(is-effective-target): Add ppc_mma_hw.
(is-effective-target-keyword): Add ppc_mma_hw.
* gcc.target/powerpc/mma-supported.c: New file.
* gcc.target/powerpc/mma-single-test.c: Require ppc_mma_hw.
* gcc.target/powerpc/mma-double-test.c: Require ppc_mma_hw.
---
 .../gcc.target/powerpc/mma-double-test.c  |  1 +
 .../gcc.target/powerpc/mma-single-test.c  |  1 +
 .../gcc.target/powerpc/mma-supported.c| 24 +
 gcc/testsuite/lib/target-supports.exp | 27 +++
 4 files changed, 53 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/mma-supported.c

diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
index 9ba0010978f..ac84ae30004 100755
--- a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-require-effective-target power10_hw } */
+/* { dg-require-effective-target ppc_mma_hw } */
 /* { dg-options "-mdejagnu-cpu=power10 -O2" } */
 
 #include 
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-single-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
index aa71fa7f0af..15369a64025 100755
--- a/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-require-effective-target power10_hw } */
+/* { dg-require-effective-target ppc_mma_hw } */
 /* { dg-options "-mdejagnu-cpu=power10 -O2" } */
 
 #include 
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-supported.c 
b/gcc/testsuite/gcc.target/powerpc/mma-supported.c
new file mode 100644
index 000..64943e362a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/mma-supported.c
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ppc_mma_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* This test will only run when the ppc_mma_hw test passes.  If that
+   test passes, then we expect to see that mma feature is supported.
+   If this is not the case, then the test environment has problems. */
+
+#include 
+#include 
+
+int
+main (int argc, char *argv[])
+{
+  int ret = 0;
+#ifdef __BUILTIN_CPU_SUPPORTS__
+  if ( !__builtin_cpu_supports ("mma"))
+{
+  printf ("Error: __builtin_cpu_supports says mma not supported, but 
ppc_mma_hw test passed.\n");
+  ret++;
+}
+#endif
+  return ret;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index aeb0351073d..04f6db53eca 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2234,6 +2234,31 @@ proc check_power10_hw_available { } {
 }]
 }
 
+# Return 1 if the target supports executing MMA instructions, 0 otherwise.
+# Cache the result.  It is assumed that if a simulator does not support the
+# MMA instructions, that it will generate an error and this test will fail.
+
+proc check_ppc_mma_hw_available { } {
+return [check_cached_effective_target ppc_mma_hw_available {
+   check_runtime_nocache ppc_mma_hw_available {
+   #include 
+   typedef double v4sf_t __attribute__ ((vector_size (16)));
+
+   int main()
+   {
+   __vector_quad acc0;
+   v4sf_t result[4];
+   result[0][0] = 1.0;
+   __builtin_mma_xxsetaccz ();
+   __builtin_mma_disassemble_acc (result, );
+   if ( result[0][0] != 0.0 )
+   return 1;
+   return 0;
+   }
+   } "-mcpu=power10"
+}]
+}
+
 # Return 1 if the target supports executing __float128 on PowerPC via software
 # emulation, 0 otherwise.  Cache the result.
 
@@ -7836,6 +7861,7 @@ proc is-effective-target { arg } {
  "gc_sections"{ set selected [check_gc_sections_available] }
  "cxa_atexit" { set selected [check_cxa_atexit_available] }
  "power10_hw" { set selected [check_power10_hw_available] }
+ "ppc_mma_hw" { set selected [check_ppc_mma_hw_available] }
  default  { error "unknown effective target keyword `$arg'" }
}
 }
@@ -7865,6 +7891,7 @@ proc is-effective-target-keyword { arg } {
  "named_sections" { return 1 }
  "gc_sections"{ return 1 }
  "cxa_atexit" { return 1 }
+ "ppc_mma_hw" { return 1 }
  default  { return 0 }
}
 }
-- 
2.17.1



[PATCH] rs6000: Add execution tests for mma builtins [v4]

2020-07-10 Thread Aaron Sawdey via Gcc-patches
This patch adds execution tests that use the MMA builtins and
check for the right answer, and new tests that checks whether
__builtin_cpu_supports and __builtin_cpu_is return sane
answers for power10.

I've now cleaned up and separated things out so there are 4 test cases:
* MMA single precision execution test
* MMA double precision execution test
* test that if effective-target is power10_hw, __builtin_cpu_is("power10")
  is also true.
* test that if effective-target is power10_hw,
  __builtin_cpu_supports("arch_3_1") is also true.

This establishes that the test environment correctly identifies itself,
and that it can execute MMA code and get the right answer.

A future patch will add an effective-target test for powerpc_mma_hw,
which these mma tests will also need to check for.

OK for trunk and backport to 10?

2020-06-30  Rajalakshmi Srinivasaraghavan  
Aaron Sawdey  

gcc/testsuite/
* gcc.target/powerpc/p10-identify.c: New file.
* gcc.target/powerpc/p10-arch31.c: New file.
* gcc.target/powerpc/mma-single-test.c: New file.
* gcc.target/powerpc/mma-double-test.c: New file.
---
 .../gcc.target/powerpc/mma-double-test.c  | 185 +
 .../gcc.target/powerpc/mma-single-test.c  | 193 ++
 gcc/testsuite/gcc.target/powerpc/p10-arch31.c |  25 +++
 .../gcc.target/powerpc/p10-identify.c |  26 +++
 4 files changed, 429 insertions(+)
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-double-test.c
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-single-test.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-arch31.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-identify.c

diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
new file mode 100755
index 000..9ba0010978f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -0,0 +1,185 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include 
+#include 
+#include 
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef double v4sf_t __attribute__ ((vector_size (16)));
+#define SAVE_ACC(ACC, ldc, J)  \
+ __builtin_mma_disassemble_acc (result, ACC); \
+ rowC = (v4sf_t *) [0*ldc+J]; \
+  rowC[0] += result[3] ; \
+  rowC = (v4sf_t *) [1*ldc+J]; \
+  rowC[0] += result[2] ; \
+  rowC = (v4sf_t *) [2*ldc+J]; \
+  rowC[0] += result[1] ; \
+  rowC = (v4sf_t *) [3*ldc+J]; \
+ rowC[0] += result[0] ;
+
+void
+MMA (int m, int n, int k, double *A, double *B, double *C)
+{
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4sf_t result[4];
+  v4sf_t *rowC;
+  for (int l = 0; l < n; l += 4)
+{
+  double *CO;
+  double *AO;
+  AO = A;
+  CO = C;
+  C += m * 4;
+  for (int j = 0; j < m; j += 16)
+   {
+ double *BO = B;
+ __builtin_mma_xxsetaccz ();
+ __builtin_mma_xxsetaccz ();
+ __builtin_mma_xxsetaccz ();
+ __builtin_mma_xxsetaccz ();
+ __builtin_mma_xxsetaccz ();
+ __builtin_mma_xxsetaccz ();
+ __builtin_mma_xxsetaccz ();
+ __builtin_mma_xxsetaccz ();
+ unsigned long i;
+
+ for (i = 0; i < k; i++)
+   {
+ vec_t *rowA = (vec_t *) & AO[i * 16];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[i * 4];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+   }
+ SAVE_ACC (&acc0, m, 0);
+ SAVE_ACC (&acc2, m, 4);
+ SAVE_ACC (&acc1, m, 2);
+ SAVE_ACC (&acc3, m, 6);
+ SAVE_ACC (&acc4, m, 8);
+ SAVE_ACC (&acc6, m, 12);
+ SAVE_ACC (&acc5, m, 10);
+ SAVE_ACC (&acc7, m, 14);
+ AO += k * 16;
+ BO += k * 4;
+ CO += 16;
+   }
+  B += k * 4;
+}
+}
+
+void
+init (double *matrix, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+{
+  for (int i = 0; i < row; i++)
+   {
+ matrix[j * row + i] = (i * 16 + 2 + j) / 0.123;
+   }
+}
+}
+
+void
+init0 (double *matrix, double *matrix1, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+for (int i = 0; i < row; i++)
+  matrix[j * row + i] = matrix1[j * row + i] = 0;
+}
+
+
+void
+print (const char *name, const double *matrix, int row, int column)
+{
+  p

Re: [PATCH] expr: Move reduce_bit_field target mode check [PR96151]

2020-07-10 Thread Aaron Sawdey via Gcc-patches
This fixed the ICE I was seeing, thanks.

Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Jul 10, 2020, at 10:40 AM, Richard Sandiford  
> wrote:
> 
> In some cases, expand_expr_real_2 prefers to use the mode of the
> caller-suggested target instead of the mode of the expression when
> passing values to reduce_to_bit_field_precision.  E.g.:
> 
>  else if (target == 0)
>op0 = convert_to_mode (mode, op0,
>   TYPE_UNSIGNED (TREE_TYPE
>  (treeop0)));
>  else
>{
>  convert_move (target, op0,
>TYPE_UNSIGNED (TREE_TYPE (treeop0)));
>  op0 = target;
>}
> 
> where “op0” might not have “mode” for the “else” branch,
> but does for all the others.
> 
> reduce_to_bit_field_precision discards the suggested target if it
> has the wrong mode.  This patch moves that to expand_expr_real_2
> instead (conditional on reduce_bit_field).
> 
> Sorry for the breakage.  This is what I'd done in the original
> version of the patch, after checking all uses of REDUCE_BIT_FIELD.
> I then forgot why it was necessary and tried to “simplify” the
> patch for backports.
> 
> Tested on arm-linux-gnueabihf, where it restores bootstrap.
> Other tests still ongoing.  OK to install if it passes?
> 
> Richard
> 
> 
> gcc/
>   PR middle-end/96151
>   * expr.c (expand_expr_real_2): When reducing bit fields,
>   clear the target if it has a different mode from the expression.
>   (reduce_to_bit_field_precision): Don't do that here.  Instead
>   assert that the target already has the correct mode.
> ---
> gcc/expr.c | 9 +
> 1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/expr.c b/gcc/expr.c
> index 715edae819a..c7c3e9fd655 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -8664,7 +8664,9 @@ expand_expr_real_2 (sepops ops, rtx target, 
> machine_mode tmode,
>   reduce_bit_field = (INTEGRAL_TYPE_P (type)
> && !type_has_mode_precision_p (type));
> 
> -  if (reduce_bit_field && modifier == EXPAND_STACK_PARM)
> +  if (reduce_bit_field
> +  && (modifier == EXPAND_STACK_PARM
> +   || (target && GET_MODE (target) != mode)))
> target = 0;
> 
>   /* Use subtarget as the target for operand 0 of a binary operation.  */
> @@ -11527,9 +11529,8 @@ reduce_to_bit_field_precision (rtx exp, rtx target, 
> tree type)
> {
>   scalar_int_mode mode = SCALAR_INT_TYPE_MODE (type);
>   HOST_WIDE_INT prec = TYPE_PRECISION (type);
> -  gcc_assert (GET_MODE (exp) == VOIDmode || GET_MODE (exp) == mode);
> -  if (target && GET_MODE (target) != mode)
> -target = 0;
> +  gcc_assert ((GET_MODE (exp) == VOIDmode || GET_MODE (exp) == mode)
> +   && (!target || GET_MODE (target) == mode));
> 
>   /* For constant values, reduce using wide_int_to_tree. */
>   if (poly_int_rtx_p (exp))



[PATCH] rs6000: Add execution tests for mma builtins. [v3]

2020-07-07 Thread Aaron Sawdey via Gcc-patches
This patch adds execution tests that use the MMA builtins and
check for the right answer, and a new test that checks whether
__builtin_cpu_supports and __builtin_cpu_is return sane answers.

One final time now that I've gotten things sorted out. OK for trunk
and backport to 10?

Thanks,
Aaron


2020-06-30  Rajalakshmi Srinivasaraghavan  
Aaron Sawdey  

gcc/testsuite/
* gcc.target/powerpc/p10-identify.c: New file.
* gcc.target/powerpc/mma-single-test.c: New file.
* gcc.target/powerpc/mma-double-test.c: New file.
---
 .../gcc.target/powerpc/mma-double-test.c  | 185 +
 .../gcc.target/powerpc/mma-single-test.c  | 193 ++
 .../gcc.target/powerpc/p10-identify.c |  32 +++
 3 files changed, 410 insertions(+)
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-double-test.c
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-single-test.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-identify.c

diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
new file mode 100755
index 000..c892b8fd4ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -0,0 +1,185 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef double v4sf_t __attribute__ ((vector_size (16)));
+#define SAVE_ACC(ACC, ldc, J)  \
+ __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[0*ldc+J]; \
+  rowC[0] += result[3] ; \
+  rowC = (v4sf_t *) &CO[1*ldc+J]; \
+  rowC[0] += result[2] ; \
+  rowC = (v4sf_t *) &CO[2*ldc+J]; \
+  rowC[0] += result[1] ; \
+  rowC = (v4sf_t *) &CO[3*ldc+J]; \
+ rowC[0] += result[0] ;
+
+void
+MMA (int m, int n, int k, double *A, double *B, double *C)
+{
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4sf_t result[4];
+  v4sf_t *rowC;
+  for (int l = 0; l < n; l += 4)
+{
+  double *CO;
+  double *AO;
+  AO = A;
+  CO = C;
+  C += m * 4;
+  for (int j = 0; j < m; j += 16)
+   {
+ double *BO = B;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ __builtin_mma_xxsetaccz (&acc2);
+ __builtin_mma_xxsetaccz (&acc3);
+ __builtin_mma_xxsetaccz (&acc4);
+ __builtin_mma_xxsetaccz (&acc5);
+ __builtin_mma_xxsetaccz (&acc6);
+ __builtin_mma_xxsetaccz (&acc7);
+ unsigned long i;
+
+ for (i = 0; i < k; i++)
+   {
+ vec_t *rowA = (vec_t *) & AO[i * 16];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[i * 4];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+   }
+ SAVE_ACC (&acc0, m, 0);
+ SAVE_ACC (&acc2, m, 4);
+ SAVE_ACC (&acc1, m, 2);
+ SAVE_ACC (&acc3, m, 6);
+ SAVE_ACC (&acc4, m, 8);
+ SAVE_ACC (&acc6, m, 12);
+ SAVE_ACC (&acc5, m, 10);
+ SAVE_ACC (&acc7, m, 14);
+ AO += k * 16;
+ BO += k * 4;
+ CO += 16;
+   }
+  B += k * 4;
+}
+}
+
+void
+init (double *matrix, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+{
+  for (int i = 0; i < row; i++)
+   {
+ matrix[j * row + i] = (i * 16 + 2 + j) / 0.123;
+   }
+}
+}
+
+void
+init0 (double *matrix, double *matrix1, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+for (int i = 0; i < row; i++)
+  matrix[j * row + i] = matrix1[j * row + i] = 0;
+}
+
+
+void
+print (const char *name, const double *matrix, int row, int column)
+{
+  printf ("Matrix %s has %d rows and %d columns:\n", name, row, column);
+  for (int i = 0; i < row; i++)
+{
+  for (int j = 0; j < column; j++)
+   {
+ printf ("%f ", matrix[j * row + i]);
+   }
+  printf ("\n");
+}
+  printf ("\n");
+}
+
+int
+main (int argc, char *argv[])
+{
+  int rowsA, colsB, common;
+  int i, j, k;
+  int ret = 0;
+
+  for (int t = 16; t <= 128; t += 16)
+{
+  for (int t1 = 4; t1 <= 16; t1 += 4)
+   {
+ rowsA = t;
+ colsB = t1;
+ common = 1;
+ /* printf ("Running test for rows = %d,cols = %d\n", t, t1); */
+ double A[rowsA * common];
+ 

[PATCH] rs6000: fix power10_hw test [v2]

2020-07-07 Thread Aaron Sawdey via Gcc-patches
The code snippet for this test was returning 1 if power10
instructions executed correctly. It should return 0 if the
test passes.
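In other words, the probe follows the usual effective-target
convention that the compiled snippet's main() exits 0 when the feature
is present. A standalone analogue of the corrected probe:

/* Standalone analogue of the probe body; the asm is the same as in
   the patch below.  */
int
main (void)
{
  /* Set e first and use +r to check if pli actually works.  */
  long e = -1;
  asm ("pli %0,%1" : "+r" (e) : "n" (0x12345));
  return e == 0x12345 ? 0 : 1;	/* 0 means power10 hardware works.  */
}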

Approved offline by Segher with slight change. Will
push after posting.


* lib/target-supports.exp (check_power10_hw_available):
Return 0 for passing test.
---
 gcc/testsuite/lib/target-supports.exp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 4bdcaef1132..848cb96aec4 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2226,7 +2226,9 @@ proc check_power10_hw_available { } {
/* Set e first and use +r to check if pli actually works.  */
long e = -1;
asm ("pli %0,%1" : "+r" (e) : "n" (0x12345));
-   return (e == 0x12345);
+   if (e == 0x12345)
+ return 0;
+   return 1;
}
} "-mcpu=power10"
 }]
-- 
2.17.1



[PATCH] rs6000: fix power10_hw test

2020-07-07 Thread Aaron Sawdey via Gcc-patches
The code snippet for this test was returning 1 if power10
instructions executed correctly. It should return 0 if the
test passes.

OK for trunk and backport to 10?

Thanks,
   Aaron

* lib/target-supports.exp (check_power10_hw_available):
Return 0 for passing test.
---
 gcc/testsuite/lib/target-supports.exp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 4bdcaef1132..c1239535a4b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2226,7 +2226,7 @@ proc check_power10_hw_available { } {
/* Set e first and use +r to check if pli actually works.  */
long e = -1;
asm ("pli %0,%1" : "+r" (e) : "n" (0x12345));
-   return (e == 0x12345);
+   return (e != 0x12345);
}
} "-mcpu=power10"
 }]
-- 
2.17.1



[PATCH] rs6000: Add execution tests for mma builtins.

2020-07-07 Thread Aaron Sawdey via Gcc-patches
Updated slightly, removed -Wno-psabi as requested and also fixed the
fact that it wasn't actually checking __builtin_cpu_is or
__builtin_cpu_supports. OK for trunk and backport to 10?

Thanks,
Aaron

2020-06-30  Rajalakshmi Srinivasaraghavan  
Aaron Sawdey  

gcc/testsuite/
* gcc.target/powerpc/mma-single-test.c: New file.
* gcc.target/powerpc/mma-double-test.c: New file.
---
 .../gcc.target/powerpc/mma-double-test.c  | 204 +
 .../gcc.target/powerpc/mma-single-test.c  | 213 ++
 2 files changed, 417 insertions(+)
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-double-test.c
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-single-test.c

diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
new file mode 100755
index 000..9fdf6d9d2a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -0,0 +1,204 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef double v4sf_t __attribute__ ((vector_size (16)));
+#define SAVE_ACC(ACC, ldc, J)  \
+ __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[0*ldc+J]; \
+  rowC[0] += result[3] ; \
+  rowC = (v4sf_t *) &CO[1*ldc+J]; \
+  rowC[0] += result[2] ; \
+  rowC = (v4sf_t *) &CO[2*ldc+J]; \
+  rowC[0] += result[1] ; \
+  rowC = (v4sf_t *) &CO[3*ldc+J]; \
+ rowC[0] += result[0] ;
+
+void
+MMA (int m, int n, int k, double *A, double *B, double *C)
+{
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4sf_t result[4];
+  v4sf_t *rowC;
+  for (int l = 0; l < n; l += 4)
+{
+  double *CO;
+  double *AO;
+  AO = A;
+  CO = C;
+  C += m * 4;
+  for (int j = 0; j < m; j += 16)
+   {
+ double *BO = B;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ __builtin_mma_xxsetaccz (&acc2);
+ __builtin_mma_xxsetaccz (&acc3);
+ __builtin_mma_xxsetaccz (&acc4);
+ __builtin_mma_xxsetaccz (&acc5);
+ __builtin_mma_xxsetaccz (&acc6);
+ __builtin_mma_xxsetaccz (&acc7);
+ unsigned long i;
+
+ for (i = 0; i < k; i++)
+   {
+ vec_t *rowA = (vec_t *) & AO[i * 16];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[i * 4];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+   }
+ SAVE_ACC (&acc0, m, 0);
+ SAVE_ACC (&acc2, m, 4);
+ SAVE_ACC (&acc1, m, 2);
+ SAVE_ACC (&acc3, m, 6);
+ SAVE_ACC (&acc4, m, 8);
+ SAVE_ACC (&acc6, m, 12);
+ SAVE_ACC (&acc5, m, 10);
+ SAVE_ACC (&acc7, m, 14);
+ AO += k * 16;
+ BO += k * 4;
+ CO += 16;
+   }
+  B += k * 4;
+}
+}
+
+void
+init (double *matrix, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+{
+  for (int i = 0; i < row; i++)
+   {
+ matrix[j * row + i] = (i * 16 + 2 + j) / 0.123;
+   }
+}
+}
+
+void
+init0 (double *matrix, double *matrix1, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+for (int i = 0; i < row; i++)
+  matrix[j * row + i] = matrix1[j * row + i] = 0;
+}
+
+
+void
+print (const char *name, const double *matrix, int row, int column)
+{
+  printf ("Matrix %s has %d rows and %d columns:\n", name, row, column);
+  for (int i = 0; i < row; i++)
+{
+  for (int j = 0; j < column; j++)
+   {
+ printf ("%f ", matrix[j * row + i]);
+   }
+  printf ("\n");
+}
+  printf ("\n");
+}
+
+int
+main (int argc, char *argv[])
+{
+  int rowsA, colsB, common;
+  int i, j, k;
+  int ret = 0;
+
+  for (int t = 16; t <= 128; t += 16)
+{
+  for (int t1 = 4; t1 <= 16; t1 += 4)
+   {
+ rowsA = t;
+ colsB = t1;
+ common = 1;
+ /* printf ("Running test for rows = %d,cols = %d\n", t, t1); */
+ double A[rowsA * common];
+ double B[common * colsB];
+ double C[rowsA * colsB];
+ double D[rowsA * colsB];
+
+
+ init (A, rowsA, common);
+ init (B, common, colsB);
+ init0 (C, D, rowsA, colsB);
+ MMA (rowsA, colsB, common, A

[PATCH] rs6000: Add execution tests for mma builtins.

2020-06-30 Thread Aaron Sawdey via Gcc-patches
This patch adds execution tests that use the MMA builtins,
checks for the right answer, and checks that __builtin_cpu_supports
and __builtin_cpu_is return sane answers given that the code
executed correctly.

Tested against the P10 simulator; it should not execute anywhere else
because it requires power10_hw. Note that the power10_hw test likely
also requires a current glibc that has picked up the change letting
__builtin_cpu_is("power10") work. OK for trunk?

Thanks,
   Aaron

2020-06-30  Rajalakshmi Srinivasaraghavan  
    Aaron Sawdey  

gcc/testsuite/
* gcc.target/powerpc/mma-single-test.c: New file.
* gcc.target/powerpc/mma-double-test.c: New file.
---
 .../gcc.target/powerpc/mma-double-test.c  | 211 +
 .../gcc.target/powerpc/mma-single-test.c  | 220 ++
 2 files changed, 431 insertions(+)
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-double-test.c
 create mode 100755 gcc/testsuite/gcc.target/powerpc/mma-single-test.c

diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
new file mode 100755
index 000..e3807fa2eab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -0,0 +1,211 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-Wno-psabi -mdejagnu-cpu=power10 -O2" } */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef double v4sf_t __attribute__ ((vector_size (16)));
+#define SAVE_ACC(ACC, ldc, J)  \
+ __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[0*ldc+J]; \
+  rowC[0] += result[3] ; \
+  rowC = (v4sf_t *) &CO[1*ldc+J]; \
+  rowC[0] += result[2] ; \
+  rowC = (v4sf_t *) &CO[2*ldc+J]; \
+  rowC[0] += result[1] ; \
+  rowC = (v4sf_t *) &CO[3*ldc+J]; \
+ rowC[0] += result[0] ;
+
+void
+MMA (int m, int n, int k, double *A, double *B, double *C)
+{
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+  v4sf_t result[4];
+  v4sf_t *rowC;
+  for (int l = 0; l < n; l += 4)
+{
+  double *CO;
+  double *AO;
+  AO = A;
+  CO = C;
+  C += m * 4;
+  for (int j = 0; j < m; j += 16)
+   {
+ double *BO = B;
+ __builtin_mma_xxsetaccz (&acc0);
+ __builtin_mma_xxsetaccz (&acc1);
+ __builtin_mma_xxsetaccz (&acc2);
+ __builtin_mma_xxsetaccz (&acc3);
+ __builtin_mma_xxsetaccz (&acc4);
+ __builtin_mma_xxsetaccz (&acc5);
+ __builtin_mma_xxsetaccz (&acc6);
+ __builtin_mma_xxsetaccz (&acc7);
+ unsigned long i;
+
+ for (i = 0; i < k; i++)
+   {
+ vec_t *rowA = (vec_t *) & AO[i * 16];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[i * 4];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+   }
+ SAVE_ACC (&acc0, m, 0);
+ SAVE_ACC (&acc2, m, 4);
+ SAVE_ACC (&acc1, m, 2);
+ SAVE_ACC (&acc3, m, 6);
+ SAVE_ACC (&acc4, m, 8);
+ SAVE_ACC (&acc6, m, 12);
+ SAVE_ACC (&acc5, m, 10);
+ SAVE_ACC (&acc7, m, 14);
+ AO += k * 16;
+ BO += k * 4;
+ CO += 16;
+   }
+  B += k * 4;
+}
+}
+
+void
+init (double *matrix, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+{
+  for (int i = 0; i < row; i++)
+   {
+ matrix[j * row + i] = (i * 16 + 2 + j) / 0.123;
+   }
+}
+}
+
+void
+init0 (double *matrix, double *matrix1, int row, int column)
+{
+  for (int j = 0; j < column; j++)
+for (int i = 0; i < row; i++)
+  matrix[j * row + i] = matrix1[j * row + i] = 0;
+}
+
+
+void
+print (const char *name, const double *matrix, int row, int column)
+{
+  printf ("Matrix %s has %d rows and %d columns:\n", name, row, column);
+  for (int i = 0; i < row; i++)
+{
+  for (int j = 0; j < column; j++)
+   {
+ printf ("%f ", matrix[j * row + i]);
+   }
+  printf ("\n");
+}
+  printf ("\n");
+}
+
+int
+main (int argc, char *argv[])
+{
+  int rowsA, colsB, common;
+  int i, j, k;
+  int ret = 0;
+
+  for (int t = 16; t <= 128; t += 16)
+{
+  for (int t1 = 4; t1 <= 16; t1 += 4)
+   {
+ rowsA = t;
+ colsB = t1;
+ common = 1;
+ /* printf ("Running test for rows = %d,cols = %d\n", t, t1); */
+ double A[rowsA * common];
+ double B[comm

[PATCH] rs6000: Allow --with-cpu=power10

2020-06-23 Thread Aaron Sawdey via Gcc-patches
Update config.gcc so that we can use --with-cpu=power10.

I've tested that this does do the expected thing 
with --with-cpu=power10 and also that it still builds and
bootstraps correctly using --with-cpu=power9 on power9. If there isn't
any other testing I need to do for this, ok for trunk?

Thanks!
   Aaron

* config.gcc: Identify power10 as a 64-bit processor and as valid
for --with-cpu and --with-tune.
---
 gcc/config.gcc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 365263a0f46..829b6f757f2 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -514,7 +514,7 @@ powerpc*-*-*)
extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h 
si2vmx.h"
extra_headers="${extra_headers} amo.h"
case x$with_cpu in
-   
xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture)
+   
xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower10|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture)
cpu_is_64bit=yes
;;
esac
@@ -4912,7 +4912,7 @@ case "${target}" in
eval "with_$which=405"
;;
"" | common | native \
-   | power[3456789] | power5+ | power6x \
+   | power[3456789] | power10 | power5+ | power6x \
| powerpc | powerpc64 | powerpc64le \
| rs64 \
| 401 | 403 | 405 | 405fp | 440 | 440fp | 464 | 464fp \
-- 
2.17.1



Re: [pushed][PATCH] identify lfs prefixed case PR95347

2020-06-15 Thread Aaron Sawdey via Gcc-patches
Now that this has been in trunk for a bit with no issues, ok to backport to 10?


Aaron Sawdey, Ph.D. saw...@linux.ibm.com
IBM Linux on POWER Toolchain
 

> On Jun 3, 2020, at 4:10 PM, Aaron Sawdey  wrote:
> 
> This passed regstrap and was approved offline by Segher, posting
> the final form (minus my debug code, oops).
> 
> The same problem also arises for plfs where prefixed_load_p()
> doesn't recognize it so we get just lfs in the asm output
> with an @pcrel address.
> 
>   PR target/95347
>   * config/rs6000/rs6000.c (is_stfs_insn): Rename to
>   is_lfs_stfs_insn and make it recognize lfs as well.
>   (prefixed_store_p): Use is_lfs_stfs_insn().
>   (prefixed_load_p): Use is_lfs_stfs_insn() to recognize lfs.
> ---
> gcc/config/rs6000/rs6000.c | 37 +
> 1 file changed, 25 insertions(+), 12 deletions(-)
> 
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index ba9069ecc3b..42d517c1f65 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -24980,14 +24980,18 @@ address_to_insn_form (rtx addr,
>   return INSN_FORM_BAD;
> }
> 
> -/* Helper function to see if we're potentially looking at stfs.
> +/* Helper function to see if we're potentially looking at lfs/stfs.
>- PARALLEL containing a SET and a CLOBBER
> -   - SET is from UNSPEC_SI_FROM_SF to MEM:SI
> -   - CLOBBER is a V4SF
> +   - stfs:
> +- SET is from UNSPEC_SI_FROM_SF to MEM:SI
> +- CLOBBER is a V4SF
> +   - lfs:
> +- SET is from UNSPEC_SF_FROM_SI to REG:SF
> +- CLOBBER is a DI
>  */
> 
> static bool
> -is_stfs_insn (rtx_insn *insn)
> +is_lfs_stfs_insn (rtx_insn *insn)
> {
>   rtx pattern = PATTERN (insn);
>   if (GET_CODE (pattern) != PARALLEL)
> @@ -25013,16 +25017,22 @@ is_stfs_insn (rtx_insn *insn)
>   rtx src = SET_SRC (set);
>   rtx scratch = SET_DEST (clobber);
> 
> -  if (GET_CODE (src) != UNSPEC || XINT (src, 1) != UNSPEC_SI_FROM_SF)
> +  if (GET_CODE (src) != UNSPEC)
> return false;
> 
> -  if (GET_CODE (dest) != MEM || GET_MODE (dest) != SImode)
> -return false;
> +  /* stfs case.  */
> +  if (XINT (src, 1) == UNSPEC_SI_FROM_SF
> +  && GET_CODE (dest) == MEM && GET_MODE (dest) == SImode
> +  && GET_CODE (scratch) == SCRATCH && GET_MODE (scratch) == V4SFmode)
> +return true;
> 
> -  if (GET_CODE (scratch) != SCRATCH || GET_MODE (scratch) != V4SFmode)
> -return false;
> +  /* lfs case.  */
> +  if (XINT (src, 1) == UNSPEC_SF_FROM_SI
> +  && GET_CODE (dest) == REG && GET_MODE (dest) == SFmode
> +  && GET_CODE (scratch) == SCRATCH && GET_MODE (scratch) == DImode)
> +return true;
> 
> -  return true;
> +  return false;
> }
> 
> /* Helper function to take a REG and a MODE and turn it into the non-prefixed
> @@ -25135,7 +25145,10 @@ prefixed_load_p (rtx_insn *insn)
>   else
> non_prefixed = reg_to_non_prefixed (reg, mem_mode);
> 
> -  return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed);
> +  if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn))
> +return address_is_prefixed (XEXP (mem, 0), mem_mode, 
> NON_PREFIXED_DEFAULT);
> +  else
> +return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed);
> }
> 
> /* Whether a store instruction is a prefixed instruction.  This is called from
> @@ -25170,7 +25183,7 @@ prefixed_store_p (rtx_insn *insn)
>   /* Need to make sure we aren't looking at a stfs which doesn't look
>  like the other things reg_to_non_prefixed/address_is_prefixed
>  looks for.  */
> -  if (non_prefixed == NON_PREFIXED_X && is_stfs_insn (insn))
> +  if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn))
> return address_is_prefixed (addr, mem_mode, NON_PREFIXED_DEFAULT);
>   else
> return address_is_prefixed (addr, mem_mode, non_prefixed);
> -- 
> 2.17.1
> 



[pushed][PATCH] identify lfs prefixed case PR95347

2020-06-03 Thread Aaron Sawdey via Gcc-patches
This passed regstrap and was approved offline by Segher, posting
the final form (minus my debug code, oops).

The same problem also arises for plfs where prefixed_load_p()
doesn't recognize it so we get just lfs in the asm output
with an @pcrel address.

PR target/95347
* config/rs6000/rs6000.c (is_stfs_insn): Rename to
is_lfs_stfs_insn and make it recognize lfs as well.
(prefixed_store_p): Use is_lfs_stfs_insn().
(prefixed_load_p): Use is_lfs_stfs_insn() to recognize lfs.
---
 gcc/config/rs6000/rs6000.c | 37 +
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index ba9069ecc3b..42d517c1f65 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -24980,14 +24980,18 @@ address_to_insn_form (rtx addr,
   return INSN_FORM_BAD;
 }
 
-/* Helper function to see if we're potentially looking at stfs.
+/* Helper function to see if we're potentially looking at lfs/stfs.
- PARALLEL containing a SET and a CLOBBER
-   - SET is from UNSPEC_SI_FROM_SF to MEM:SI
-   - CLOBBER is a V4SF
+   - stfs:
+- SET is from UNSPEC_SI_FROM_SF to MEM:SI
+- CLOBBER is a V4SF
+   - lfs:
+- SET is from UNSPEC_SF_FROM_SI to REG:SF
+- CLOBBER is a DI
  */
 
 static bool
-is_stfs_insn (rtx_insn *insn)
+is_lfs_stfs_insn (rtx_insn *insn)
 {
   rtx pattern = PATTERN (insn);
   if (GET_CODE (pattern) != PARALLEL)
@@ -25013,16 +25017,22 @@ is_stfs_insn (rtx_insn *insn)
   rtx src = SET_SRC (set);
   rtx scratch = SET_DEST (clobber);
 
-  if (GET_CODE (src) != UNSPEC || XINT (src, 1) != UNSPEC_SI_FROM_SF)
+  if (GET_CODE (src) != UNSPEC)
 return false;
 
-  if (GET_CODE (dest) != MEM || GET_MODE (dest) != SImode)
-return false;
+  /* stfs case.  */
+  if (XINT (src, 1) == UNSPEC_SI_FROM_SF
+  && GET_CODE (dest) == MEM && GET_MODE (dest) == SImode
+  && GET_CODE (scratch) == SCRATCH && GET_MODE (scratch) == V4SFmode)
+return true;
 
-  if (GET_CODE (scratch) != SCRATCH || GET_MODE (scratch) != V4SFmode)
-return false;
+  /* lfs case.  */
+  if (XINT (src, 1) == UNSPEC_SF_FROM_SI
+  && GET_CODE (dest) == REG && GET_MODE (dest) == SFmode
+  && GET_CODE (scratch) == SCRATCH && GET_MODE (scratch) == DImode)
+return true;
 
-  return true;
+  return false;
 }
 
 /* Helper function to take a REG and a MODE and turn it into the non-prefixed
@@ -25135,7 +25145,10 @@ prefixed_load_p (rtx_insn *insn)
   else
 non_prefixed = reg_to_non_prefixed (reg, mem_mode);
 
-  return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed);
+  if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn))
+return address_is_prefixed (XEXP (mem, 0), mem_mode, NON_PREFIXED_DEFAULT);
+  else
+return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed);
 }
 
 /* Whether a store instruction is a prefixed instruction.  This is called from
@@ -25170,7 +25183,7 @@ prefixed_store_p (rtx_insn *insn)
   /* Need to make sure we aren't looking at a stfs which doesn't look
  like the other things reg_to_non_prefixed/address_is_prefixed
  looks for.  */
-  if (non_prefixed == NON_PREFIXED_X && is_stfs_insn (insn))
+  if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn))
 return address_is_prefixed (addr, mem_mode, NON_PREFIXED_DEFAULT);
   else
 return address_is_prefixed (addr, mem_mode, non_prefixed);
-- 
2.17.1



[PATCH] rs6000: identify lfs prefixed case PR95347

2020-06-02 Thread Aaron Sawdey via Gcc-patches
The same problem also arises for plfs where prefixed_load_p()
doesn't recognize it so we get just lfs in the asm output
with a @pcrel address.

OK for trunk if regstrap on ppc64le passes?

Thanks,
   Aaron


PR target/95347
* config/rs6000/rs6000.c (is_stfs_insn): Rename to
is_lfs_stfs_insn and make it recognize lfs as well.
(prefixed_store_p): Use is_lfs_stfs_insn().
(prefixed_load_p): Use is_lfs_stfs_insn() to recognize lfs.
---
 gcc/config/rs6000/rs6000.c | 41 +++---
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index ba9069ecc3b..8ed8ae04e7a 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -24980,14 +24980,18 @@ address_to_insn_form (rtx addr,
   return INSN_FORM_BAD;
 }
 
-/* Helper function to see if we're potentially looking at stfs.
+/* Helper function to see if we're potentially looking at lfs/stfs.
- PARALLEL containing a SET and a CLOBBER
-   - SET is from UNSPEC_SI_FROM_SF to MEM:SI
-   - CLOBBER is a V4SF
+   - stfs:
+- SET is from UNSPEC_SI_FROM_SF to MEM:SI
+- CLOBBER is a V4SF
+   - lfs:
+- SET is from UNSPEC_SF_FROM_SI to REG:SF
+- CLOBBER is a DI
  */
 
 static bool
-is_stfs_insn (rtx_insn *insn)
+is_lfs_stfs_insn (rtx_insn *insn)
 {
   rtx pattern = PATTERN (insn);
   if (GET_CODE (pattern) != PARALLEL)
@@ -25013,16 +25017,22 @@ is_stfs_insn (rtx_insn *insn)
   rtx src = SET_SRC (set);
   rtx scratch = SET_DEST (clobber);
 
-  if (GET_CODE (src) != UNSPEC || XINT (src, 1) != UNSPEC_SI_FROM_SF)
+  if (GET_CODE (src) != UNSPEC)
 return false;
 
-  if (GET_CODE (dest) != MEM || GET_MODE (dest) != SImode)
-return false;
+  /* stfs case.  */
+  if (XINT (src, 1) == UNSPEC_SI_FROM_SF
+  && GET_CODE (dest) == MEM && GET_MODE (dest) == SImode
+  && GET_CODE (scratch) == SCRATCH && GET_MODE (scratch) == V4SFmode)
+return true;
 
-  if (GET_CODE (scratch) != SCRATCH || GET_MODE (scratch) != V4SFmode)
-return false;
+  /* lfs case.  */
+  if (XINT (src, 1) == UNSPEC_SF_FROM_SI
+  && GET_CODE (dest) == REG && GET_MODE (dest) == SFmode
+  && GET_CODE (scratch) == SCRATCH && GET_MODE (scratch) == DImode)
+return true;
 
-  return true;
+  return false;
 }
 
 /* Helper function to take a REG and a MODE and turn it into the non-prefixed
@@ -25135,7 +25145,14 @@ prefixed_load_p (rtx_insn *insn)
   else
 non_prefixed = reg_to_non_prefixed (reg, mem_mode);
 
-  return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed);
+  fprintf(stderr,"prefixed_load_p regmode %s memmode %s non_prefixed %d\n",
+ GET_MODE_NAME(reg_mode), GET_MODE_NAME(mem_mode), non_prefixed);
+  debug_rtx(insn);
+
+  if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn))
+return address_is_prefixed (XEXP (mem, 0), mem_mode, NON_PREFIXED_DEFAULT);
+  else
+return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed);
 }
 
 /* Whether a store instruction is a prefixed instruction.  This is called from
@@ -25170,7 +25187,7 @@ prefixed_store_p (rtx_insn *insn)
   /* Need to make sure we aren't looking at a stfs which doesn't look
  like the other things reg_to_non_prefixed/address_is_prefixed
  looks for.  */
-  if (non_prefixed == NON_PREFIXED_X && is_stfs_insn (insn))
+  if (non_prefixed == NON_PREFIXED_X && is_lfs_stfs_insn (insn))
 return address_is_prefixed (addr, mem_mode, NON_PREFIXED_DEFAULT);
   else
 return address_is_prefixed (addr, mem_mode, non_prefixed);
-- 
2.17.1



[PATCH] rs6000: PR target/95347 Correctly identify stfs if prefixed

2020-05-29 Thread Aaron Sawdey via Gcc-patches
Because reg_to_non_prefixed() only looks at the register being used, it
doesn't get the right answer for stfs, which leads to us not seeing
that it has a PCREL symbol ref.  This patch works around this by
introducing a helper function that inspects the insn to see if it is in
fact a stfs. Then if we use NON_PREFIXED_DEFAULT, address_to_insn_form()
can see that it has the PCREL symbol ref.
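For reference, the insn shape the helper matches is, in outline:

/* Outline of the PARALLEL recognized by is_stfs_insn(), reconstructed
   from the checks in the patch below (not a verbatim RTL dump):

     (parallel
       [(set (mem:SI ...)
	     (unspec:SI [...] UNSPEC_SI_FROM_SF))
	(clobber (scratch:V4SF))])  */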

OK for trunk if regstrap on ppc64le passes?

Thanks,
   Aaron

2020-05-29  Aaron Sawdey  

PR target/95347
* config/rs6000/rs6000.c (prefixed_store_p): Add special case
for stfs.
(is_stfs_insn): New helper function.
---
 gcc/config/rs6000/rs6000.c | 60 +-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 8435bc15d72..d58fca4 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -24980,6 +24980,58 @@ address_to_insn_form (rtx addr,
   return INSN_FORM_BAD;
 }
 
+/* Helper function to see if we're potentially looking at stfs that
+   could be pstfs.  */
+
+static bool
+is_stfs_insn (rtx_insn *insn)
+{
+  rtx pattern=PATTERN (insn);
+  if (GET_CODE (pattern) != PARALLEL)
+return false;
+
+  /* This should be a parallel with exactly one set and one clobber.  */
+  int i;
+  rtx set=NULL, clobber=NULL;
+  for (i = 0; i < XVECLEN (pattern, 0); i++)
+{
+  rtx elt = XVECEXP (pattern, 0, i);
+  if (GET_CODE (elt) == SET)
+   {
+ if (set)
+   return false;
+ set = elt;
+   }
+  else if (GET_CODE (elt) == CLOBBER)
+   {
+ if (clobber)
+   return false;
+ clobber = elt;
+   }
+  else
+   return false;
+}
+
+  /* All we care is that the destination of the SET is a mem:SI,
+ the source should be an UNSPEC_SI_FROM_SF, and the clobber
+ should be a scratch:V4SF.  */
+
+  rtx dest = XEXP (set, 0);
+  rtx src = XEXP (set, 1);
+  rtx scratch = XEXP (clobber, 0);
+
+  if (GET_CODE (src) != UNSPEC || XINT (src, 1) != UNSPEC_SI_FROM_SF)
+return false;
+
+  if (GET_CODE (dest) != MEM || GET_MODE (dest) != SImode)
+return false;
+
+  if (GET_CODE (scratch) != SCRATCH || GET_MODE (scratch) != V4SFmode)
+return false;
+
+  return true;
+}
+
 /* Helper function to take a REG and a MODE and turn it into the non-prefixed
instruction format (D/DS/DQ) used for offset memory.  */
 
@@ -25119,8 +25171,14 @@ prefixed_store_p (rtx_insn *insn)
 return false;
 
   machine_mode mem_mode = GET_MODE (mem);
+  rtx addr = XEXP (mem, 0);
   enum non_prefixed_form non_prefixed = reg_to_non_prefixed (reg, mem_mode);
-  return address_is_prefixed (XEXP (mem, 0), mem_mode, non_prefixed);
+  /* Need to make sure we aren't looking at a stfs which doesn't
+ look like the other things that we are looking for.  */
+  if (non_prefixed == NON_PREFIXED_X && is_stfs_insn (insn))
+return address_is_prefixed (addr, mem_mode, NON_PREFIXED_DEFAULT);
+  else
+return address_is_prefixed (addr, mem_mode, non_prefixed);
 }
 
 /* Whether a load immediate or add instruction is a prefixed instruction.  This
-- 
2.17.1



[PATCH][v3], rs6000: Use plq/pstq for atomic_{load, store} (PR94622)

2020-04-21 Thread Aaron Sawdey via Gcc-patches
For a future architecture with prefixed instructions, always use plq/pstq
rather than lq/stq for atomic load of quadword. Then we never have to
do the doubleword swap on little endian. Before this fix, -mno-pcrel
would generate lq with the doubleword swap (which was ok) and -mpcrel
would generate plq, also with the doubleword swap, which was wrong.
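For context, a hypothetical source fragment that exercises these
expanders (not from the patch; when GCC cannot inline the quadword
atomic it calls libatomic instead):

#include <stdatomic.h>

_Atomic __int128 shared;

__int128 get (void) { return atomic_load (&shared); }
void put (__int128 v) { atomic_store (&shared, v); }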

While adding comments I realized we have exactly the same problem with
pstq/stq so I have added fixes for that as well. Assuming that regstrap
passes, OK for trunk?

Thanks,
   Aaron

2020-04-20  Aaron Sawdey  

PR target/94622
* config/rs6000/sync.md (load_quadpti): Add attr "prefixed"
if TARGET_PREFIXED.
(store_quadpti): Ditto.
(atomic_load): Do not swap doublewords if TARGET_PREFIXED as
plq will be used and doesn't need it.
(atomic_store): Ditto, for pstq.
---
 gcc/config/rs6000/sync.md | 27 ++-
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/gcc/config/rs6000/sync.md b/gcc/config/rs6000/sync.md
index f27edc77b6a..bf529fc8268 100644
--- a/gcc/config/rs6000/sync.md
+++ b/gcc/config/rs6000/sync.md
@@ -122,6 +122,7 @@ (define_insn "loadsync_<mode>"
   [(set_attr "type" "isync")
(set_attr "length" "12")])
 
+;; If TARGET_PREFIXED, always use plq rather than lq.
 (define_insn "load_quadpti"
   [(set (match_operand:PTI 0 "quad_int_reg_operand" "=&r")
	(unspec:PTI
	 [(match_operand:TI 1 "quad_memory_operand" "wQ")] UNSPEC_LSQ))]
@@ -129,8 +130,18 @@ (define_insn "load_quadpti"
   "TARGET_SYNC_TI
&& !reg_mentioned_p (operands[0], operands[1])"
   "lq %0,%1"
-  [(set_attr "type" "load")])
-
+  [(set_attr "type" "load")
+   (set (attr "prefixed") (if_then_else (match_test "TARGET_PREFIXED")
+(const_string "yes")
+(const_string "no")))])
+
+;; Pattern load_quadpti will always use plq for atomic TImode if
+;; TARGET_PREFIXED.  It has the correct doubleword ordering on either LE
+;; or BE, so we can just move the result into the output register and
+;; do not need to do the doubleword swap for LE. Also this avoids any
+;; confusion about whether the lq vs plq might be used based on whether
+;; op1 has PC-relative addressing. We could potentially allow BE to
+;; use lq because it doesn't have the doubleword ordering problem.
 (define_expand "atomic_load"
   [(set (match_operand:AINT 0 "register_operand")  ;; output
(match_operand:AINT 1 "memory_operand"));; memory
@@ -162,7 +173,7 @@ (define_expand "atomic_load<mode>"
 
   emit_insn (gen_load_quadpti (pti_reg, op1));
 
-  if (WORDS_BIG_ENDIAN)
+  if (WORDS_BIG_ENDIAN || TARGET_PREFIXED)
emit_move_insn (op0, gen_lowpart (TImode, pti_reg));
   else
{
@@ -186,14 +197,20 @@ (define_expand "atomic_load"
   DONE;
 })
 
+;; If TARGET_PREFIXED, always use pstq rather than stq.
 (define_insn "store_quadpti"
   [(set (match_operand:PTI 0 "quad_memory_operand" "=wQ")
(unspec:PTI
 [(match_operand:PTI 1 "quad_int_reg_operand" "r")] UNSPEC_LSQ))]
   "TARGET_SYNC_TI"
   "stq %1,%0"
-  [(set_attr "type" "store")])
+  [(set_attr "type" "store")
+   (set (attr "prefixed") (if_then_else (match_test "TARGET_PREFIXED")
+(const_string "yes")
+(const_string "no")))])
 
+;; Pattern store_quadpti will always use pstq if TARGET_PREFIXED,
+;; so the doubleword swap is never needed in that case.
 (define_expand "atomic_store"
   [(set (match_operand:AINT 0 "memory_operand");; memory
(match_operand:AINT 1 "register_operand"))  ;; input
@@ -232,7 +249,7 @@ (define_expand "atomic_store<mode>"
  operands[0] = op0 = replace_equiv_address (op0, new_addr);
}
 
-  if (WORDS_BIG_ENDIAN)
+  if (WORDS_BIG_ENDIAN || TARGET_PREFIXED)
emit_move_insn (pti_reg, gen_lowpart (PTImode, op1));
   else
{
-- 
2.17.1



Re: [PATCH][v2], rs6000, PR/target 94622, Be more careful with plq for atomic_load

2020-04-20 Thread Aaron Sawdey via Gcc-patches
For a future architecture with prefixed instructions, always use plq
rather than lq for atomic load of quadword. Then we never have to
do the doubleword swap on little endian. Before this fix, -mno-pcrel
would generate lq with the doubleword swap (which was ok) and -mpcrel
would generate plq, also with the doubleword swap, which was wrong.

So, of course you can't use set_attr with an if_then_else. The below
code actually builds and passes regstrap on ppc64le power9.

OK for trunk?

Thanks,
Aaron


2020-04-20  Aaron Sawdey  

PR target/94622
* config/rs6000/sync.md (load_quadpti): Add attr "prefixed"
if TARGET_PREFIXED.
(atomic_load): Do not swap doublewords if TARGET_PREFIXED as
plq will be used and doesn't need it.
---
 gcc/config/rs6000/sync.md | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gcc/config/rs6000/sync.md b/gcc/config/rs6000/sync.md
index f27edc77b6a..96cef082dd5 100644
--- a/gcc/config/rs6000/sync.md
+++ b/gcc/config/rs6000/sync.md
@@ -129,7 +129,10 @@ (define_insn "load_quadpti"
   "TARGET_SYNC_TI
&& !reg_mentioned_p (operands[0], operands[1])"
   "lq %0,%1"
-  [(set_attr "type" "load")])
+  [(set_attr "type" "load")
+   (set (attr "prefixed") (if_then_else (match_test "TARGET_PREFIXED")
+(const_string "yes")
+(const_string "no")))])
 
 (define_expand "atomic_load"
   [(set (match_operand:AINT 0 "register_operand")  ;; output
@@ -162,7 +165,7 @@ (define_expand "atomic_load<mode>"
 
   emit_insn (gen_load_quadpti (pti_reg, op1));
 
-  if (WORDS_BIG_ENDIAN)
+  if (WORDS_BIG_ENDIAN || TARGET_PREFIXED)
emit_move_insn (op0, gen_lowpart (TImode, pti_reg));
   else
{
-- 
2.17.1



[PATCH], rs6000, PR/target 94622, Be more careful with plq for atomic_load

2020-04-20 Thread Aaron Sawdey via Gcc-patches
For a future architecture with prefixed instructions, always use plq
rather than lq for atomic load of quadword. Then we never have to
do the doubleword swap on little endian. Before this fix, -mno-pcrel
would generate lq with the doubleword swap (which was ok) and -mpcrel
would generate plq, also with the doubleword swap, which was wrong.

OK for trunk if regstrap passes on ppc64le power9?

Thanks,
   Aaron

2020-04-20  Aaron Sawdey  

PR target/94622
* config/rs6000/sync.md (load_quadpti): Make this have attr prefixed
if TARGET_PREFIXED.
(atomic_load): Do not swap doublewords if TARGET_PREFIXED as
plq will be used and doesn't need it.
---
 gcc/config/rs6000/sync.md | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gcc/config/rs6000/sync.md b/gcc/config/rs6000/sync.md
index f27edc77b6a..64dfda6ef75 100644
--- a/gcc/config/rs6000/sync.md
+++ b/gcc/config/rs6000/sync.md
@@ -129,7 +129,10 @@ (define_insn "load_quadpti"
   "TARGET_SYNC_TI
&& !reg_mentioned_p (operands[0], operands[1])"
   "lq %0,%1"
-  [(set_attr "type" "load")])
+  [(set_attr "type" "load")
+   (set_attr "prefixed" (if_then_else (match_test "TARGET_PREFIXED")
+  (const_string "yes")
+  (const_string "no")))])
 
 (define_expand "atomic_load"
   [(set (match_operand:AINT 0 "register_operand")  ;; output
@@ -162,7 +165,7 @@ (define_expand "atomic_load<mode>"
 
   emit_insn (gen_load_quadpti (pti_reg, op1));
 
-  if (WORDS_BIG_ENDIAN)
+  if (WORDS_BIG_ENDIAN || TARGET_PREFIXED)
emit_move_insn (op0, gen_lowpart (TImode, pti_reg));
   else
{
-- 
2.17.1



[PATCH][rs6000][PR92379] fix UB shift of 64-bit type by 64 bits

2020-03-13 Thread Aaron Sawdey via Gcc-patches
This is a fix for PR92379. Passes regstrap on ppc64le. Pre-approved by 
Segher, committing after posting.


2020-03-13  Aaron Sawdey 

    PR target/92379
    * config/rs6000/rs6000.c (num_insns_constant_multi) Don't shift a
    64-bit value by 64 bits (UB).

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 24598aff663..5798f924472 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5612,7 +5612,10 @@ num_insns_constant_multi (HOST_WIDE_INT value, 
machine_mode mode)

   && rs6000_is_valid_and_mask (GEN_INT (low), DImode))
 insns = 2;
   total += insns;
-  value >>= BITS_PER_WORD;
+  /* If BITS_PER_WORD is the number of bits in HOST_WIDE_INT, doing
+     it all at once would be UB. */
+  value >>= (BITS_PER_WORD - 1);
+  value >>= 1;
 }
   return total;
 }
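The reasoning behind the two-step shift, in isolation:

/* Standalone illustration (not GCC source): shifting an N-bit value
   by N is undefined behavior in C, so when BITS_PER_WORD equals the
   width of HOST_WIDE_INT a single shift by BITS_PER_WORD is UB.
   Shifting by (N - 1) and then by 1 is defined for any N >= 1.  */
#include <stdint.h>

static int64_t
drop_low_word (int64_t value, int bits_per_word)
{
  value >>= bits_per_word - 1;	/* shift count is at most 63 */
  value >>= 1;			/* shift count is exactly 1  */
  return value;
}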







Re: [PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-10-03 Thread Aaron Sawdey
On 10/2/19 5:44 PM, Aaron Sawdey wrote:
> On 10/2/19 5:35 PM, Jakub Jelinek wrote:
>> On Wed, Oct 02, 2019 at 09:21:23AM -0500, Aaron Sawdey wrote:
>>>>> 2019-09-27  Aaron Sawdey 
>>>>>
>>>>>   * builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>>   (expand_builtin_memcpy): Use might_overlap parm.
>>>>>   (expand_builtin_mempcpy_args): Use might_overlap parm.
>>>>>   (expand_builtin_memmove): Call expand_builtin_memory_copy_args.
>>>>>   (expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>>   * expr.c (emit_block_move_via_cpymem): Rename to
>>>>>   emit_block_move_via_pattern, add might_overlap parm, use cpymem
>>>>>   or movmem optab as appropriate.
>>>>>   (emit_block_move_hints): Add might_overlap parm, do the right
>>>>>   thing for might_overlap==true.
>>>>>   * expr.h (emit_block_move_hints): Update prototype.
>>
>>> @@ -1622,13 +1624,30 @@
>>>set_mem_size (y, const_size);
>>>  }
>>>
>>> -  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
>>> +  bool pieces_ok = can_move_by_pieces (INTVAL (size), align);
>>> +  bool pattern_ok = false;
>>> +
>>> +  if (!CONST_INT_P (size) || !pieces_ok || might_overlap)
>> ...
>>
>> This change broke rtl checking bootstrap.
>> You can't use INTVAL on size that isn't CONST_INT_P.
>>
>> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux,
>> committed to trunk as obvious:
> 
> Jakub,
>   Sorry about that! Now that you point it out, it's obvious. But what it means
> for me is that I need to be in the habit of bootstrapping with 
> --enable-checking=rtl
> when I make these changes.

I stared at this for a bit and came up with a slightly cleaner fix that is one 
less line:

2019-10-03  Aaron Sawdey 

* expr.c (emit_block_move_hints): Slightly cleaner fix to
can_move_by_pieces issue.

Index: gcc/expr.c
===
--- gcc/expr.c  (revision 276516)
+++ gcc/expr.c  (working copy)
@@ -1624,9 +1624,8 @@
   set_mem_size (y, const_size);
 }

-  bool pieces_ok = false;
-  if (CONST_INT_P (size))
-pieces_ok = can_move_by_pieces (INTVAL (size), align);
+  bool pieces_ok = CONST_INT_P (size)
+&& can_move_by_pieces (INTVAL (size), align);
   bool pattern_ok = false;

   if (!pieces_ok || might_overlap)

Bootstrap/regtest (with --enable-checking=yes,rtl,tree) ok on ppc64le (power9),
committed as obvious.
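For anyone skimming: the one-declaration form is safe for the same
reason as the two-statement version, namely short-circuit evaluation:

/* Illustration only: the right operand of && is evaluated just when
   the left is true, so the predicate guards the accessor -- the same
   property that keeps INTVAL off a non-CONST_INT above.  */
#include <stdbool.h>
#include <stddef.h>

static bool
positive (const int *p)
{
  return p != NULL && *p > 0;	/* *p never evaluated when p is null */
}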

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain


Re: [PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-10-02 Thread Aaron Sawdey
On 10/2/19 5:35 PM, Jakub Jelinek wrote:
> On Wed, Oct 02, 2019 at 09:21:23AM -0500, Aaron Sawdey wrote:
>>>> 2019-09-27  Aaron Sawdey 
>>>>
>>>>* builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>(expand_builtin_memcpy): Use might_overlap parm.
>>>>(expand_builtin_mempcpy_args): Use might_overlap parm.
>>>>(expand_builtin_memmove): Call expand_builtin_memory_copy_args.
>>>>(expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>* expr.c (emit_block_move_via_cpymem): Rename to
>>>>emit_block_move_via_pattern, add might_overlap parm, use cpymem
>>>>or movmem optab as appropriate.
>>>>(emit_block_move_hints): Add might_overlap parm, do the right
>>>>thing for might_overlap==true.
>>>>* expr.h (emit_block_move_hints): Update prototype.
> 
>> @@ -1622,13 +1624,30 @@
>>set_mem_size (y, const_size);
>>  }
>>
>> -  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
>> +  bool pieces_ok = can_move_by_pieces (INTVAL (size), align);
>> +  bool pattern_ok = false;
>> +
>> +  if (!CONST_INT_P (size) || !pieces_ok || might_overlap)
> ...
> 
> This change broke rtl checking bootstrap.
> You can't use INTVAL on size that isn't CONST_INT_P.
> 
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux,
> committed to trunk as obvious:

Jakub,
  Sorry about that! Now that you point it out, it's obvious. But what it means
for me is that I need to be in the habit of bootstrapping with 
--enable-checking=rtl
when I make these changes.

  Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain


Re: [PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-10-02 Thread Aaron Sawdey
On 10/1/19 4:45 PM, Jeff Law wrote:
> On 9/27/19 12:23 PM, Aaron Sawdey wrote:
>> This is the third piece of my effort to improve inline expansion of memmove. 
>> The
>> first two parts I posted back in June fixed the names of the optab entries
>> involved so that optab cpymem is used for memcpy() and optab movmem is used 
>> for
>> memmove(). This piece adds support for actually attempting to invoke the 
>> movmem
>> optab to do inline expansion of __builtin_memmove().
>>
>> Because what needs to be done for memmove() is very similar to memcpy(), I 
>> have
>> just added a bool parm "might_overlap" to several of the functions involved 
>> so
>> the same functions can handle both. The name might_overlap comes from the 
>> fact
>> that if we still have a memmove() call at expand, this means
>> gimple_fold_builtin_memory_op() was not able to prove that the source and
>> destination do not overlap.
>>
>> There are a few places where might_overlap gets used to keep us from trying 
>> to
>> use the by-pieces infrastructure or generate a copy loop, as neither of those
>> things will work correctly if source and destination overlap.
>>
>> I've restructured things slightly in emit_block_move_hints() so that we can
>> try the pattern first if we already know that by-pieces won't work. This way
>> we can bail out immediately in the might_overlap case.
>>
>> Bootstrap/regtest passed on ppc64le, in progress on x86_64. If everything 
>> passes,
>> is this ok for trunk?
>>
>>
>> 2019-09-27  Aaron Sawdey 
>>
>>  * builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
>>  (expand_builtin_memcpy): Use might_overlap parm.
>>  (expand_builtin_mempcpy_args): Use might_overlap parm.
>>  (expand_builtin_memmove): Call expand_builtin_memory_copy_args.
>>  (expand_builtin_memory_copy_args): Add might_overlap parm.
>>  * expr.c (emit_block_move_via_cpymem): Rename to
>>  emit_block_move_via_pattern, add might_overlap parm, use cpymem
>>  or movmem optab as appropriate.
>>  (emit_block_move_hints): Add might_overlap parm, do the right
>>  thing for might_overlap==true.
>>  * expr.h (emit_block_move_hints): Update prototype.
>>
>>
>>
>>
>> Index: gcc/builtins.c
>> ===
>> --- gcc/builtins.c   (revision 276131)
>> +++ gcc/builtins.c   (working copy)
>> @@ -3894,10 +3897,11 @@
>>  _max_size);
>>src_str = c_getstr (src);
>>
>> -  /* If SRC is a string constant and block move would be done
>> - by pieces, we can avoid loading the string from memory
>> - and only stored the computed constants.  */
>> -  if (src_str
>> +  /* If SRC is a string constant and block move would be done by
>> + pieces, we can avoid loading the string from memory and only
>> + stored the computed constants.  I'm not sure if the by pieces
>> + method works if src/dest are overlapping, so avoid that case.  */
>> +  if (src_str && !might_overlap
> I don't think you need the check here.  c_getstr, when it returns
> somethign useful is going to be returning a string constant.  Think read
> only literals here.  I'm pretty sure overlap isn't going to be possible.

After some digging, I agree -- c_getstr() returns a string constant,
which is then used to generate stores of constants.
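Concretely, the case in question looks like this (illustrative, not
from the testsuite):

/* When the source is a string literal, the copy can be emitted as
   stores of constants; a read-only literal cannot alias a writable
   destination, so no overlap check is needed.  */
void
set4 (char *dst)
{
  __builtin_memcpy (dst, "abc", 4);
  /* may become a single word store of the constant bytes, e.g.
     *(unsigned int *) dst = 0x00636261 on a little-endian target  */
}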

I've fixed the other issues and also fixed emit_block_move_via_pattern() to
make use of pieces_ok instead of calling can_move_by_pieces() a second
time. The patch I'm actually committing is below.

Thanks for the review!

  Aaron

Index: gcc/builtins.c
===
--- gcc/builtins.c  (revision 276131)
+++ gcc/builtins.c  (working copy)
@@ -127,7 +127,8 @@
 static rtx expand_builtin_memcpy (tree, rtx);
 static rtx expand_builtin_memory_copy_args (tree dest, tree src, tree len,
rtx target, tree exp,
-   memop_ret retmode);
+   memop_ret retmode,
+   bool might_overlap);
 static rtx expand_builtin_memmove (tree, rtx);
 static rtx expand_builtin_mempcpy (tree, rtx);
 static rtx expand_builtin_mempcpy_args (tree, tree, tree, rtx, tree, 
memop_ret);
@@ -3790,7 +3791,7 @@
   check_memop_access (exp, dest, src, len);

   return expand_builtin_memory_copy_args (dest, src, len, target, exp,
- /*retmode=*/ RETURN_BEGIN);
+ 

[PATCH, RS6000] Add movmemsi pattern for inline expansion of memmove()

2019-09-30 Thread Aaron Sawdey
This patch uses the support added in the patch I posted last week for
actually doing inline expansion of memmove(). I've added a might_overlap
parameter to expand_block_move() to tell it when it must make sure to
handle overlapping moves. I changed the code to save up the generated rtx
for both loads and stores instead of just stores. In the
might_overlap==true case, if we get to MAX_MOVE_REG and the move is not
done yet, then we bail out and return false. So what this can now do is
inline expand any memmove() that can be done in 4 loads followed by 4
stores. It will use lxv/stxv if size/alignment allows, otherwise it will
use unaligned integer loads/stores. So it can expand most memmove() up to
32 bytes, and some that are 33-64 bytes if the arguments are 16-byte
aligned.
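To see why buffering all the loads before emitting any store makes an
overlapping move safe, here is the strategy hand-expanded for a
32-byte case (illustrative only):

void
move32 (char *dst, const char *src)
{
  unsigned long long t0, t1, t2, t3;
  __builtin_memcpy (&t0, src, 8);	/* all loads happen first...    */
  __builtin_memcpy (&t1, src + 8, 8);
  __builtin_memcpy (&t2, src + 16, 8);
  __builtin_memcpy (&t3, src + 24, 8);
  __builtin_memcpy (dst, &t0, 8);	/* ...so no store can clobber a */
  __builtin_memcpy (dst + 8, &t1, 8);	/* source byte not yet read.    */
  __builtin_memcpy (dst + 16, &t2, 8);
  __builtin_memcpy (dst + 24, &t3, 8);
}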

I've also removed the code from expand_block_move() for dealing with
mode==BLKmode because I don't believe that can happen. The big if construct that
figures out which size we are going to use has a plain else on it, and every
clause in it sets mode to something other than BLKmode. So I removed that code
to simplify things and just left a gcc_assert(mode != BLKmode).

Regtest is in progress on ppc64le (power9); if tests pass, is this ok for
trunk after the movmem optab patch posted last week is approved?

Thanks!
   Aaron

2019-09-30  Aaron Sawdey 

* config/rs6000/rs6000-protos.h (expand_block_move): Change prototype.
* config/rs6000/rs6000-string.c (expand_block_move): Add might_overlap 
parm.
* config/rs6000/rs6000.md (movmemsi): Add new pattern.
(cpymemsi): Add might_overlap parm to expand_block_move() call.



Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 276131)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -69,7 +69,7 @@
 extern void rs6000_generate_float2_double_code (rtx, rtx, rtx);
 extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx);
 extern int expand_block_clear (rtx[]);
-extern int expand_block_move (rtx[]);
+extern int expand_block_move (rtx[], bool);
 extern bool expand_block_compare (rtx[]);
 extern bool expand_strn_compare (rtx[], int);
 extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 276131)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2719,7 +2719,7 @@
 #define MAX_MOVE_REG 4

 int
-expand_block_move (rtx operands[])
+expand_block_move (rtx operands[], bool might_overlap)
 {
   rtx orig_dest = operands[0];
   rtx orig_src = operands[1];
@@ -2730,6 +2730,7 @@
   int bytes;
   int offset;
   int move_bytes;
+  rtx loads[MAX_MOVE_REG];
   rtx stores[MAX_MOVE_REG];
   int num_reg = 0;

@@ -2817,47 +2818,35 @@
  gen_func.mov = gen_movqi;
}

+  /* Mode is always set to something other than BLKmode by one of the
+cases of the if statement above.  */
+  gcc_assert (mode != BLKmode);
+
   src = adjust_address (orig_src, mode, offset);
   dest = adjust_address (orig_dest, mode, offset);

-  if (mode != BLKmode)
-   {
- rtx tmp_reg = gen_reg_rtx (mode);
+  rtx tmp_reg = gen_reg_rtx (mode);
+
+  loads[num_reg]= (*gen_func.mov) (tmp_reg, src);
+  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);

- emit_insn ((*gen_func.mov) (tmp_reg, src));
- stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
-   }
+  /* If we didn't succeed in doing it in one pass, we can't do it in the
+might_overlap case.  Bail out and return failure.  */
+  if (might_overlap && num_reg >= MAX_MOVE_REG
+ && bytes > move_bytes)
+   return 0;

-  if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
+  /* Emit loads and stores saved up.  */
+  if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
{
  int i;
  for (i = 0; i < num_reg; i++)
+   emit_insn (loads[i]);
+ for (i = 0; i < num_reg; i++)
emit_insn (stores[i]);
  num_reg = 0;
}
-
-  if (mode == BLKmode)
-   {
- /* Move the address into scratch registers.  The movmemsi
-patterns require zero offset.  */
- if (!REG_P (XEXP (src, 0)))
-   {
- rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
- src = replace_equiv_address (src, src_reg);
-   }
- set_mem_size (src, move_bytes);
-
- if (!REG_P (XEXP (dest, 0)))
-   {
- rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
- dest = replace_equiv_address (dest, dest_reg);
-   }
- set_mem_size (dest, move_bytes);
-
- emit_insn ((*gen_func.movmemsi) (dest, src,
-

[PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-09-27 Thread Aaron Sawdey
This is the third piece of my effort to improve inline expansion of memmove. The
first two parts I posted back in June fixed the names of the optab entries
involved so that optab cpymem is used for memcpy() and optab movmem is used for
memmove(). This piece adds support for actually attempting to invoke the movmem
optab to do inline expansion of __builtin_memmove().

Because what needs to be done for memmove() is very similar to memcpy(), I have
just added a bool parm "might_overlap" to several of the functions involved so
the same functions can handle both. The name might_overlap comes from the fact
that if we still have a memmove() call at expand, this means
gimple_fold_builtin_memory_op() was not able to prove that the source and
destination do not overlap.
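An illustrative pair of cases (assuming the usual gimple folding
behavior; these are not from the patch):

char a[16], b[16];

/* Distinct objects: the folder can typically prove no overlap, so
   this becomes a memcpy before expand ever sees it.  */
void f (void) { __builtin_memmove (a, b, 16); }

/* Unknown pointers: no proof of non-overlap, so the memmove call
   survives to expand and might_overlap will be true.  */
void g (char *p, char *q) { __builtin_memmove (p, q, 16); }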

There are a few places where might_overlap gets used to keep us from trying to
use the by-pieces infrastructure or generate a copy loop, as neither of those
things will work correctly if source and destination overlap.

I've restructured things slightly in emit_block_move_hints() so that we can
try the pattern first if we already know that by-pieces won't work. This way
we can bail out immediately in the might_overlap case.
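In outline, the flow after this restructure (condensed; the exact code
is in the diff below):

/* 1. pieces_ok: size is a compile-time constant and
      can_move_by_pieces() agrees.
   2. If !pieces_ok or might_overlap, try the cpymem/movmem pattern
      first.
   3. If the pattern failed and might_overlap, give up: neither
      by-pieces nor a copy loop is valid for overlapping operands.
   4. Otherwise fall back to by-pieces or the copy loop as before.  */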

Bootstrap/regtest passed on ppc64le, in progress on x86_64. If everything 
passes,
is this ok for trunk?


2019-09-27  Aaron Sawdey 

* builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
(expand_builtin_memcpy): Use might_overlap parm.
(expand_builtin_mempcpy_args): Use might_overlap parm.
(expand_builtin_memmove): Call expand_builtin_memory_copy_args.
(expand_builtin_memory_copy_args): Add might_overlap parm.
* expr.c (emit_block_move_via_cpymem): Rename to
emit_block_move_via_pattern, add might_overlap parm, use cpymem
or movmem optab as appropriate.
(emit_block_move_hints): Add might_overlap parm, do the right
thing for might_overlap==true.
* expr.h (emit_block_move_hints): Update prototype.




Index: gcc/builtins.c
===
--- gcc/builtins.c  (revision 276131)
+++ gcc/builtins.c  (working copy)
@@ -127,7 +127,8 @@
 static rtx expand_builtin_memcpy (tree, rtx);
 static rtx expand_builtin_memory_copy_args (tree dest, tree src, tree len,
rtx target, tree exp,
-   memop_ret retmode);
+   memop_ret retmode,
+   bool might_overlap);
 static rtx expand_builtin_memmove (tree, rtx);
 static rtx expand_builtin_mempcpy (tree, rtx);
 static rtx expand_builtin_mempcpy_args (tree, tree, tree, rtx, tree, 
memop_ret);
@@ -3790,7 +3791,7 @@
   check_memop_access (exp, dest, src, len);

   return expand_builtin_memory_copy_args (dest, src, len, target, exp,
- /*retmode=*/ RETURN_BEGIN);
+ /*retmode=*/ RETURN_BEGIN, false);
 }

 /* Check a call EXP to the memmove built-in for validity.
@@ -3797,7 +3798,7 @@
Return NULL_RTX on both success and failure.  */

 static rtx
-expand_builtin_memmove (tree exp, rtx)
+expand_builtin_memmove (tree exp, rtx target)
 {
   if (!validate_arglist (exp,
 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
@@ -3809,7 +3810,8 @@

   check_memop_access (exp, dest, src, len);

-  return NULL_RTX;
+  return expand_builtin_memory_copy_args (dest, src, len, target, exp,
+ /*retmode=*/ RETURN_BEGIN, true);
 }

 /* Expand a call EXP to the mempcpy builtin.
@@ -3858,7 +3860,8 @@

 static rtx
 expand_builtin_memory_copy_args (tree dest, tree src, tree len,
-rtx target, tree exp, memop_ret retmode)
+rtx target, tree exp, memop_ret retmode,
+bool might_overlap)
 {
   const char *src_str;
   unsigned int src_align = get_pointer_alignment (src);
@@ -3894,10 +3897,11 @@
_max_size);
   src_str = c_getstr (src);

-  /* If SRC is a string constant and block move would be done
- by pieces, we can avoid loading the string from memory
- and only stored the computed constants.  */
-  if (src_str
+  /* If SRC is a string constant and block move would be done by
+     pieces, we can avoid loading the string from memory and only
+     store the computed constants.  I'm not sure if the by-pieces
+     method works if src/dest are overlapping, so avoid that case.  */
+  if (src_str && !might_overlap
   && CONST_INT_P (len_rtx)
   && (unsigned HOST_WIDE_INT) INTVAL (len_rtx) <= strlen (src_str) + 1
   && can_store_by_pieces (INTVAL (len_rtx), builtin_memcpy_read_str,
@@ -3922,7 +3926,7 @@
   && (retmode == RETURN_BEGIN || target == const0_rtx))

[PATCH] Add movmem optab entry back in for overlapping moves

2019-07-02 Thread Aaron Sawdey
This is the second piece for allowing inline expansion of memmove. Now that
the old movmem patterns have all been renamed to cpymem, the movmem optab can
be added back.
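As a reminder of why overlap is the defining requirement for this optab,
here is a standalone C illustration (not GCC code) of the forward/backward
choice that a correct movmem expansion, like memmove itself, has to make:

#include <stddef.h>

/* If dst is inside [src, src + n), a forward copy clobbers source bytes
   before they are read; copying backward handles that case.  */
static void
move_bytes (unsigned char *dst, const unsigned char *src, size_t n)
{
  if (dst < src)
    for (size_t i = 0; i < n; i++)
      dst[i] = src[i];  /* forward copy is safe */
  else
    while (n-- > 0)
      dst[n] = src[n];  /* backward copy for dst >= src */
}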

Next piece will be: add support for __builtin_memmove() to use the movmem optab 
and
associated patterns.

This patch passes bootstrap/regtest on ppc64le and x86_64. Ok for trunk?

2019-07-02  Aaron Sawdey  

* optabs.def (movmem_optab): Add movmem back for memmove().
* doc/md.texi: Add description of movmem pattern for overlapping move.


Index: gcc/doc/md.texi
===
--- gcc/doc/md.texi (revision 272762)
+++ gcc/doc/md.texi (working copy)
@@ -6237,6 +6237,42 @@
 overlap. These patterns are used to do inline expansion of
 @code{__builtin_memcpy}.

+@cindex @code{movmem@var{m}} instruction pattern
+@item @samp{movmem@var{m}}
+Block move instruction.  The destination and source blocks of memory
+are the first two operands, and both are @code{mem:BLK}s with an
+address in mode @code{Pmode}.
+
+The number of bytes to copy is the third operand, in mode @var{m}.
+Usually, you specify @code{Pmode} for @var{m}.  However, if you can
+generate better code knowing the range of valid lengths is smaller than
+those representable in a full Pmode pointer, you should provide
+a pattern with a
+mode corresponding to the range of values you can handle efficiently
+(e.g., @code{QImode} for values in the range 0--127; note we avoid numbers
+that appear negative) and also a pattern with @code{Pmode}.
+
+The fourth operand is the known shared alignment of the source and
+destination, in the form of a @code{const_int} rtx.  Thus, if the
+compiler knows that both source and destination are word-aligned,
+it may provide the value 4 for this operand.
+
+Optional operands 5 and 6 specify expected alignment and size of block
+respectively.  The expected alignment differs from alignment in operand 4
+in a way that the blocks are not required to be aligned according to it in
+all cases. This expected alignment is also in bytes, just like operand 4.
+Expected size, when unknown, is set to @code{(const_int -1)}.
+
+Descriptions of multiple @code{movmem@var{m}} patterns can only be
+beneficial if the patterns for smaller modes have fewer restrictions
+on their first, second and fourth operands.  Note that the mode @var{m}
+in @code{movmem@var{m}} does not impose any restriction on the mode of
+individually copied data units in the block.
+
+The @code{movmem@var{m}} patterns must correctly handle the case where
+the source and destination strings overlap. These patterns are used to
+do inline expansion of @code{__builtin_memmove}.
+
 @cindex @code{movstr} instruction pattern
 @item @samp{movstr}
 String copy instruction, with @code{stpcpy} semantics.  Operand 0 is
Index: gcc/optabs.def
===
--- gcc/optabs.def  (revision 272762)
+++ gcc/optabs.def  (working copy)
@@ -257,6 +257,7 @@
 OPTAB_D (cmpstr_optab, "cmpstr$a")
 OPTAB_D (cmpstrn_optab, "cmpstrn$a")
 OPTAB_D (cpymem_optab, "cpymem$a")
+OPTAB_D (movmem_optab, "movmem$a")
 OPTAB_D (setmem_optab, "setmem$a")
 OPTAB_D (strlen_optab, "strlen$a")


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH 32/30] Document movmem/cpymem changes in gcc-10/changes.html

2019-06-27 Thread Aaron Sawdey
On 6/25/19 4:43 PM, Jeff Law wrote:
> On 6/25/19 2:22 PM, acsaw...@linux.ibm.com wrote:
>> From: Aaron Sawdey 
>>
>>  * builtins.c (get_memory_rtx): Fix comment.
>>  * optabs.def (movmem_optab): Change to cpymem_optab.
>>  * expr.c (emit_block_move_via_cpymem): Change movmem to cpymem.
>>  (emit_block_move_hints): Change movmem to cpymem.
>>  * defaults.h: Change movmem to cpymem.
>>  * targhooks.c (get_move_ratio): Change movmem to cpymem.
>>  (default_use_by_pieces_infrastructure_p): Ditto.
> So I think you're missing an update to the RTL/MD documentation.  This
> is also likely to cause problems for any out-of-tree ports, so it's
> probably worth a mention in the gcc-10 changes, which will need to be
> created (in CVS no less, ugh).
> 
> I think the stuff posted to-date is fine, but it shouldn't go in without
> the corresponding docs and gcc-10 changes updates.

Here is the corresponding documentation change for gcc-10/changes.html.

OK for trunk?

Thanks,
Aaron




Index: changes.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-10/changes.html,v
retrieving revision 1.4
diff -r1.4 changes.html
139c139,149
< 
---
> Other significant improvements
> 
>   
> To allow inline expansion of both memcpy
> and memmove, the existing movmem instruction
> patterns used for non-overlapping memory copies have been renamed to
> cpymem. The movmem name is now used
> for overlapping memory moves, consistent with the
> library functions memcpy and memmove.
>   
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain


[PATCH 31/30] Update documentation for movmem to cpymem change

2019-06-26 Thread Aaron Sawdey
On 6/25/19 4:43 PM, Jeff Law wrote:
> On 6/25/19 2:22 PM, acsaw...@linux.ibm.com wrote:
>> From: Aaron Sawdey 
>>
>>  * builtins.c (get_memory_rtx): Fix comment.
>>  * optabs.def (movmem_optab): Change to cpymem_optab.
>>  * expr.c (emit_block_move_via_cpymem): Change movmem to cpymem.
>>  (emit_block_move_hints): Change movmem to cpymem.
>>  * defaults.h: Change movmem to cpymem.
>>  * targhooks.c (get_move_ratio): Change movmem to cpymem.
>>  (default_use_by_pieces_infrastructure_p): Ditto.
> So I think you're missing an update to the RTL/MD documentation.  This
> is also likely to cause problems for any out-of-tree ports, so it's
> probably worth a mention in the gcc-10 changes, which will need to be
> created (in CVS no less, ugh).
> 
> I think the stuff posted to-date is fine, but it shouldn't go in without
> the corresponding docs and gcc-10 changes updates.
This would be my proposed patch to update the documentation. I'll also work
out what the entry in the gcc 10 changes and post that for review before
this all goes in.

OK for trunk along with the other 30 patches?

Thanks,
Aaron



* doc/md.texi: Change movmem to cpymem and update description to match.
* doc/rtl.texi: Change movmem to cpymem.
* target.def (use_by_pieces_infrastructure_p): Change movmem to cpymem.
* doc/tm.texi: Regenerate.
---
 gcc/doc/md.texi  | 26 ++
 gcc/doc/rtl.texi |  2 +-
 gcc/doc/tm.texi  |  4 ++--
 gcc/target.def   |  4 ++--
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index b45b4be..3f9d545 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6200,13 +6200,13 @@ This pattern is not allowed to @code{FAIL}.
 @item @samp{one_cmpl@var{m}2}
 Store the bitwise-complement of operand 1 into operand 0.

-@cindex @code{movmem@var{m}} instruction pattern
-@item @samp{movmem@var{m}}
-Block move instruction.  The destination and source blocks of memory
+@cindex @code{cpymem@var{m}} instruction pattern
+@item @samp{cpymem@var{m}}
+Block copy instruction.  The destination and source blocks of memory
 are the first two operands, and both are @code{mem:BLK}s with an
 address in mode @code{Pmode}.

-The number of bytes to move is the third operand, in mode @var{m}.
+The number of bytes to copy is the third operand, in mode @var{m}.
 Usually, you specify @code{Pmode} for @var{m}.  However, if you can
 generate better code knowing the range of valid lengths is smaller than
 those representable in a full Pmode pointer, you should provide
@@ -6226,14 +6226,16 @@ in a way that the blocks are not required to be aligned 
according to it in
 all cases. This expected alignment is also in bytes, just like operand 4.
 Expected size, when unknown, is set to @code{(const_int -1)}.

-Descriptions of multiple @code{movmem@var{m}} patterns can only be
+Descriptions of multiple @code{cpymem@var{m}} patterns can only be
 beneficial if the patterns for smaller modes have fewer restrictions
 on their first, second and fourth operands.  Note that the mode @var{m}
-in @code{movmem@var{m}} does not impose any restriction on the mode of
-individually moved data units in the block.
+in @code{cpymem@var{m}} does not impose any restriction on the mode of
+individually copied data units in the block.

-These patterns need not give special consideration to the possibility
-that the source and destination strings might overlap.
+The @code{cpymem@var{m}} patterns need not give special consideration
+to the possibility that the source and destination strings might
+overlap. These patterns are used to do inline expansion of
+@code{__builtin_memcpy}.

 @cindex @code{movstr} instruction pattern
 @item @samp{movstr}
@@ -6254,7 +6256,7 @@ given as a @code{mem:BLK} whose address is in mode 
@code{Pmode}.  The
 number of bytes to set is the second operand, in mode @var{m}.  The value to
 initialize the memory with is the third operand. Targets that only support the
 clearing of memory should reject any value that is not the constant 0.  See
-@samp{movmem@var{m}} for a discussion of the choice of mode.
+@samp{cpymem@var{m}} for a discussion of the choice of mode.

 The fourth operand is the known alignment of the destination, in the form
 of a @code{const_int} rtx.  Thus, if the compiler knows that the
@@ -6272,13 +6274,13 @@ Operand 9 is the probable maximal size (i.e.@: we 
cannot rely on it for
 correctness, but it can be used for choosing proper code sequence for a
 given size).

-The use for multiple @code{setmem@var{m}} is as for @code{movmem@var{m}}.
+The use for multiple @code{setmem@var{m}} is as for @code{cpymem@var{m}}.

 @cindex @code{cmpstrn@var{m}} instruction pattern
 @item @samp{cmpstrn@var{m}}
 String compare instruction, with five operands.  Operand 0 is the output;
 it has mode @var{m}.  The remaining four operands are like the op

Re: [PATCH] sched-ebb.c: avoid moving table jumps (PR rtl-optimization/88423)

2019-02-18 Thread Aaron Sawdey


On 2/18/19 10:41 AM, Alexander Monakov wrote:
> On Mon, 18 Feb 2019, Aaron Sawdey wrote:
> 
>> The code in emit_case_dispatch_table() will very clearly always emit 
>> branch/label/jumptable_data/barrier
>> so this does need to be handled. So, yes tablejump always looks like this, 
>> and also yes it seems to be
>> ripe ground for logic bugs, we have 88308, 88347, 88423 all related to it.
>>
>> In the long term it might be nice to use a general mechanism 
>> (SCHED_GROUP_P?) for handling the label and jump
>> table data that follow a case branch using jump table.
>>
>> But for now in stage 4, I think the right way to fix this is with the patch 
>> that Segher posted earlier.
>> If regtest passes (x86_64 and ppc64/ppc32), ok for trunk?
> 
> How making an assert more permissive is "the right way" here?
> As already mentioned, without the assert we'd move a USE of the register with
> function return value to an unreachable block, which would be incorrect.
> 
> Do you anticipate issues with the sched-deps patch?

Alexander,
 I see you are allowing it to see the barrier as if it were right after the 
tablejump.

Are you saying that the motion of the tablejump is happening because the 
scheduler does not see
the barrier (because it does not follow immediately after) and thus decides it 
can move instructions
to the other side of the tablejump? I agree that is incorrect and is asking for 
other hidden problems.

It would be nice if the tablejump, jump table label, jump table data, and 
barrier were all one indivisible
unit somehow.

In the meantime, can someone approve Alexander's patch?

Thanks,
   Aaron



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] sched-ebb.c: avoid moving table jumps (PR rtl-optimization/88423)

2019-02-18 Thread Aaron Sawdey
The code in emit_case_dispatch_table() will very clearly always emit 
branch/label/jumptable_data/barrier
so this does need to be handled. So, yes tablejump always looks like this, and 
also yes it seems to be
ripe ground for logic bugs, we have 88308, 88347, 88423 all related to it.
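To make that shape concrete: emit_case_dispatch_table() leaves the stream
as tablejump -> code_label -> jump_table_data -> barrier, so reaching the
barrier means stepping over two insns. Here is a sketch of that walk, in
the spirit of Alexander's sched-deps fix quoted further down; the helper
name is invented:

/* Sketch only: return the barrier that ends a tablejump's block, or
   NULL if the stream does not have the expected shape.  */
static rtx_insn *
tablejump_barrier (rtx_insn *jump)
{
  rtx_insn *next = next_nonnote_nondebug_insn (jump);

  /* Step over the code_label and jump_table_data that follow the
     tablejump itself before the barrier appears.  */
  if (next && LABEL_P (next) && NEXT_INSN (next)
      && JUMP_TABLE_DATA_P (NEXT_INSN (next)))
    next = NEXT_INSN (NEXT_INSN (next));

  return (next && BARRIER_P (next)) ? next : NULL;
}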

In the long term it might be nice to use a general mechanism (SCHED_GROUP_P?) 
for handling the label and jump
table data that follow a case branch using jump table.

But for now in stage 4, I think the right way to fix this is with the patch 
that Segher posted earlier.
If regtest passes (x86_64 and ppc64/ppc32), ok for trunk?

2019-02-18  Aaron Sawdey  

PR rtl-optimization/88347
* schedule-ebb.c (begin_move_insn): Apply Segher's patch to handle
a jump table before the barrier.


On 1/24/19 9:43 AM, Alexander Monakov wrote:
> On Wed, 23 Jan 2019, Alexander Monakov wrote:
> 
>> It appears that sched-deps tries to take notice of a barrier after a jump, 
>> but
>> similarly to sched-ebb doesn't anticipate that for a tablejump the barrier 
>> will
>> appear after two more insns (a code_label and a jump_table_data).
>>
>> If so, it needs a fixup just like the posted change for the assert. I'll 
>> fire up
>> a bootstrap/regtest.
> 
> Updated patch below (now taking into account that NEXT_INSN may give NULL)
> passes bootstrap/regtest on x86_64, also with -fsched2-use-superblocks.
> 
> I'm surprised to learn that a tablejump may be not the final insn in its
> containing basic block.  It certainly seems like a ripe ground for logic
> bugs like this one.  Is it really intentional?
> 
> OK for trunk?
> 
> Thanks.
> Alexander
> 
>   PR rtl-optimization/88347
>   PR rtl-optimization/88423
>   * sched-deps.c (sched_analyze_insn): Take into account that for
>   tablejumps the barrier appears after a label and a jump_table_data.
> 
> --- a/gcc/sched-deps.c
> +++ b/gcc/sched-deps.c
> @@ -3005,6 +3005,11 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, 
> rtx_insn *insn)
>if (JUMP_P (insn))
>  {
>rtx_insn *next = next_nonnote_nondebug_insn (insn);
> +  /* ??? For tablejumps, the barrier may appear not immediately after
> + the jump, but after a label and a jump_table_data insn.  */
> +  if (next && LABEL_P (next) && NEXT_INSN (next)
> +   && JUMP_TABLE_DATA_P (NEXT_INSN (next)))
> + next = NEXT_INSN (NEXT_INSN (next));
>if (next && BARRIER_P (next))
>   reg_pending_barrier = MOVE_BARRIER;
>else
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH] PR rtl-optimization/88308 Update LABEL_NUSES in move_insn_for_shrink_wrap

2019-02-13 Thread Aaron Sawdey
I've tracked pr/88308 down to move_insn_for_shrink_wrap(). This function moves
an insn from one BB to another by copying it and deleting the old one.
Unfortunately this breaks the LABEL_NUSES count on any referenced labels,
because deleting the old instruction decrements the count and nothing in this
function increments it.

It just happens that on rs6000 with -m64, force_const_mem() gets called on the
address, and that sets LABEL_PRESERVE_P on the label, which prevents it from
being deleted. For whatever reason this doesn't happen in a -m32 compilation,
and the label and its associated jump table data are deleted. This later
causes the ICE when the dwarf code tries to look at the label.

Segher and I came up with 3 possible solutions to this:

1) Don't let move_insn_for_shrink_wrap try to move insns with label_ref in them.
2) Call mark_jump_label() on the copied instruction to fix up the ref counts.
3) Make the function actually move the insn instead of copying/deleting it.

It seemed like option 2 was the best thing for stage 4 as it is not inhibiting 
anything
and is just doing a fixup of the ref count.

OK for trunk after regtesting on ppc64be (32/64) and x86_64?

Thanks!
   Aaron


2019-02-13  Aaron Sawdey  

* shrink-wrap.c (move_insn_for_shrink_wrap): Fix LABEL_NUSES counts
on copied instruction.


Index: gcc/shrink-wrap.c
===
--- gcc/shrink-wrap.c   (revision 268783)
+++ gcc/shrink-wrap.c   (working copy)
@@ -414,7 +414,12 @@
   dead_debug_insert_temp (debug, DF_REF_REGNO (def), insn,
  DEBUG_TEMP_BEFORE_WITH_VALUE);

-  emit_insn_after (PATTERN (insn), bb_note (bb));
+  rtx_insn *insn_copy = emit_insn_after (PATTERN (insn), bb_note (bb));
+  /* Update the LABEL_NUSES count on any referenced labels. The ideal
+ solution here would be to actually move the instruction instead
+ of copying/deleting it as this loses some notations on the
+ insn.  */
+  mark_jump_label (PATTERN (insn), insn_copy, 0);
   delete_insn (insn);
   return true;
 }


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH, rs6000] PR target/89112 put branch probabilities on branches generated by inline expansion

2019-02-08 Thread Aaron Sawdey
Missed two more conditional branches created by inline expansion that should 
have had
branch probability notes.

2019-02-08  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_compare_loop,
expand_block_compare): Insert REG_BR_PROB notes in inline expansion of
memcmp/strncmp.

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 268547)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -1525,6 +1525,7 @@
  else
j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
   eqrtx, cond));
+ add_reg_br_prob_note (j, profile_probability::likely ());
  JUMP_LABEL (j) = fc_loop;
  LABEL_NUSES (fc_loop) += 1;

@@ -1897,6 +1898,7 @@
  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
 cvt_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+ add_reg_br_prob_note (j, profile_probability::likely ());
  JUMP_LABEL (j) = convert_label;
  LABEL_NUSES (convert_label) += 1;
}

Pre-approved by Segher for trunk and backport to 8, will commit after regtest 
completes.

  Aaron

On 2/4/19 1:06 PM, Aaron Sawdey wrote:
> This is the second part of the fix for 89112, fixing the conditions that 
> caused it to happen.
> This patch adds REG_BR_PROB notes to the branches generated by inline 
> expansion of memcmp
> and strncmp. This prevents any of the code from being marked as cold and 
> moved to the end
> of the function, which is what caused the long branches in 89112. With this 
> patch, the test
> case for 89112 does not have any long branches within the expansion of 
> memcmp, and the code
> for each memcmp is contiguous.
> 
> OK for trunk and 8 backport if bootstrap/regtest passes?
> 
> Thanks!
> 
>Aaron
> 
> 2019-02-04  Aaron Sawdey  
> 
>   * config/rs6000/rs6000-string.c (do_ifelse, expand_cmp_vec_sequence,
>   expand_compare_loop, expand_block_compare_gpr,
>   expand_strncmp_align_check, expand_strncmp_gpr_sequence): Add branch
>   probability.
> 
> 
> Index: gcc/config/rs6000/rs6000-string.c
> ===
> --- gcc/config/rs6000/rs6000-string.c (revision 268522)
> +++ gcc/config/rs6000/rs6000-string.c (working copy)
> @@ -35,6 +35,8 @@
>  #include "expr.h"
>  #include "output.h"
>  #include "target.h"
> +#include "profile-count.h"
> +#include "predict.h"
> 
>  /* Expand a block clear operation, and return 1 if successful.  Return 0
> if we should let the compiler generate normal code.
> @@ -369,6 +371,7 @@
> B is the second thing to be compared.
> CR is the condition code reg input, or NULL_RTX.
> TRUE_LABEL is the label to branch to if the condition is true.
> +   P is the estimated branch probability for the branch.
> 
> The return value is the CR used for the comparison.
> If CR is null_rtx, then a new register of CMPMODE is generated.
> @@ -377,7 +380,7 @@
> 
>  static void
>  do_ifelse (machine_mode cmpmode, rtx_code comparison,
> -rtx a, rtx b, rtx cr, rtx true_label)
> +rtx a, rtx b, rtx cr, rtx true_label, profile_probability p)
>  {
>gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
> || (a != NULL_RTX && b != NULL_RTX));
> @@ -395,7 +398,8 @@
>rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
> 
>rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
> -  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  add_reg_br_prob_note (j, p);
>JUMP_LABEL (j) = true_label;
>LABEL_NUSES (true_label) += 1;
>  }
> @@ -781,7 +785,8 @@
>rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
>rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
>lab_ref, pc_rtx);
> -  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  add_reg_br_prob_note (j2, profile_probability::likely ());
>JUMP_LABEL (j2) = dst_label;
>LABEL_NUSES (dst_label) += 1;
> 
> @@ -1036,7 +1041,7 @@
> 
>/* Difference found is stored here before jump to diff_label.  */
>rtx diff = gen_reg_rtx (word_mode);
> -  rtx j;
> +  rtx_insn *j;
> 
>/* Example of generated code for 35 bytes aligned 1 byte.
> 
> @@ -1120,11 +1125,11 @@
> 

[PATCH, rs6000] PR target/89112 put branch probabilities on branches generated by inline expansion

2019-02-04 Thread Aaron Sawdey
This is the second part of the fix for 89112, fixing the conditions that caused 
it to happen.
This patch adds REG_BR_PROB notes to the branches generated by inline expansion 
of memcmp
and strncmp. This prevents any of the code from being marked as cold and moved 
to the end
of the function, which is what caused the long branches in 89112. With this 
patch, the test
case for 89112 does not have any long branches within the expansion of memcmp, 
and the code
for each memcmp is contiguous.

OK for trunk and 8 backport if bootstrap/regtest passes?

Thanks!

   Aaron

2019-02-04  Aaron Sawdey  

* config/rs6000/rs6000-string.c (do_ifelse, expand_cmp_vec_sequence,
expand_compare_loop, expand_block_compare_gpr,
expand_strncmp_align_check, expand_strncmp_gpr_sequence): Add branch
probability.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 268522)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -35,6 +35,8 @@
 #include "expr.h"
 #include "output.h"
 #include "target.h"
+#include "profile-count.h"
+#include "predict.h"

 /* Expand a block clear operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.
@@ -369,6 +371,7 @@
B is the second thing to be compared.
CR is the condition code reg input, or NULL_RTX.
TRUE_LABEL is the label to branch to if the condition is true.
+   P is the estimated branch probability for the branch.

The return value is the CR used for the comparison.
If CR is null_rtx, then a new register of CMPMODE is generated.
@@ -377,7 +380,7 @@

 static void
 do_ifelse (machine_mode cmpmode, rtx_code comparison,
-  rtx a, rtx b, rtx cr, rtx true_label)
+  rtx a, rtx b, rtx cr, rtx true_label, profile_probability p)
 {
   gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
  || (a != NULL_RTX && b != NULL_RTX));
@@ -395,7 +398,8 @@
   rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

   rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
-  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  add_reg_br_prob_note (j, p);
   JUMP_LABEL (j) = true_label;
   LABEL_NUSES (true_label) += 1;
 }
@@ -781,7 +785,8 @@
   rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
   rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
 lab_ref, pc_rtx);
-  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  add_reg_br_prob_note (j2, profile_probability::likely ());
   JUMP_LABEL (j2) = dst_label;
   LABEL_NUSES (dst_label) += 1;

@@ -1036,7 +1041,7 @@

   /* Difference found is stored here before jump to diff_label.  */
   rtx diff = gen_reg_rtx (word_mode);
-  rtx j;
+  rtx_insn *j;

   /* Example of generated code for 35 bytes aligned 1 byte.

@@ -1120,11 +1125,11 @@
   /* Check for > max_bytes bytes.  We want to bail out as quickly as
 possible if we have to go over to memcmp.  */
   do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
-NULL_RTX, library_call_label);
+NULL_RTX, library_call_label, profile_probability::even ());

   /* Check for < loop_bytes bytes.  */
   do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
-NULL_RTX, cleanup_label);
+NULL_RTX, cleanup_label, profile_probability::even ());

   /* Loop compare bytes and iterations if bytes>max_bytes.  */
   rtx mb_reg = gen_reg_rtx (word_mode);
@@ -1165,7 +1170,7 @@
{
  rtx lab_after = gen_label_rtx ();
  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
-NULL_RTX, lab_after);
+NULL_RTX, lab_after, profile_probability::even ());
  emit_move_insn (loop_cmp, mb_reg);
  emit_move_insn (iter, mi_reg);
  emit_label (lab_after);
@@ -1236,7 +1241,7 @@
}

   do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
-dcond, diff_label);
+dcond, diff_label, profile_probability::unlikely ());

   if (TARGET_P9_MISC)
{
@@ -1260,6 +1265,7 @@
   else
j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
   eqrtx, dcond));
+  add_reg_br_prob_note (j, profile_probability::likely ());
   JUMP_LABEL (j) = loop_top_label;
   LABEL_NUSES (loop_top_label) += 1;
 }
@@ -1272,9 +1278,11 @@
  code.  If we exit here with a nonzero diff, it is
  because the second word differed.  */
   if (TARGET_P9_MISC)
-do_ifelse (CCUNSmode, NE, NULL_RTX, NUL

[PATCH, rs6000] PR target/89112 [8/9 Regression] fix bdnzt pattern for long branch case

2019-02-02 Thread Aaron Sawdey
I needed to introduce a local label in this pattern because output_cbranch put 
out a second instruction
in the long branch case. This fixes the issue but there are a couple ways this 
could be improved:

* output_cbranch() is passed the original insn and assumes from that that the 
branch is a long
branch. However this is incorrect because we are just branching to a local 
label we know is only
a few instructions away. If there is a way to fix this, an unnecessary branch 
could be eliminated.

* While the long branch case of this pattern needs to work, the real problem is 
that part of
the code emitted by the memcmp expansion is being treated as cold code and 
moved to the end of
the function. Ideally all of this code should stay together. I suspect I need 
to make some kind
of branch frequency notation for this to happen.

Regstrap passes on ppc64le power7/8/9, ok for trunk and backport to 8?

Thanks!

2019-02-02  Aaron Sawdey  

* config/rs6000/rs6000.md (tf_): Generate a local label
for the long branch case.

Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 268403)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -12639,8 +12639,8 @@
   else
 {
   static char seq[96];
-  char *bcs = output_cbranch (operands[3], "$+8", 1, insn);
-  sprintf(seq, " $+12\;%s;b %%l0", bcs);
+  char *bcs = output_cbranch (operands[3], ".L%=", 1, insn);
+  sprintf(seq, " .L%%=\;%s\;b %%l0\;.L%%=:", bcs);
   return seq;
 }
 }



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2019-01-14 Thread Aaron Sawdey
The patch for this was committed to trunk as 267562 (see below). Is this also 
ok for backport to 8?

Thanks,
   Aaron

On 12/20/18 5:44 PM, Segher Boessenkool wrote:
> On Thu, Dec 20, 2018 at 05:34:54PM -0600, Aaron Sawdey wrote:
>> On 12/20/18 3:51 AM, Segher Boessenkool wrote:
>>> On Wed, Dec 19, 2018 at 01:53:05PM -0600, Aaron Sawdey wrote:
>>>> Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
>>>> to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
>>>> inline expansion from doing unaligned vector or using vector load/store
>>>> other than lvx/stvx. More description of the issue is here:
>>>>
>>>> https://patchwork.ozlabs.org/patch/814059/
>>>>
>>>> OK for trunk if bootstrap/regtest ok?
>>>
>>> Okay, but see below.
>>>
>> [snip]
>>>
>>> This is extraordinarily clumsy :-)  Maybe something like:
>>>
>>> static rtx
>>> gen_lvx_v4si_move (rtx dest, rtx src)
>>> {
>>>   gcc_assert (!(MEM_P (dest) && MEM_P (src));
>>>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
>>>   if (MEM_P (dest))
>>> return gen_altivec_stvx_v4si_internal (dest, src);
>>>   else if (MEM_P (src))
>>> return gen_altivec_lvx_v4si_internal (dest, src);
>>>   else
>>> gcc_unreachable ();
>>> }
>>>
>>> (Or do you allow VOIDmode for src as well?)  Anyway, at least get rid of
>>> the useless extra variable.
>>
>> I think this should be better:
> 
> The gcc_unreachable at the end catches the non-mem to non-mem case.
> 
>> static rtx
>> gen_lvx_v4si_move (rtx dest, rtx src)
>> {
>>   gcc_assert ((MEM_P (dest) && !MEM_P (src)) || (MEM_P (src) && 
>> !MEM_P(dest)));
> 
> But if you prefer this, how about
> 
> {
>   gcc_assert (MEM_P (dest) ^ MEM_P (src));
>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
> 
>   if (MEM_P (dest))
> return gen_altivec_stvx_v4si_internal (dest, src);
>   else
> return gen_altivec_lvx_v4si_internal (dest, src);
> }
> 
> :-)
> 
> 
> Segher
> 

2019-01-03  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_move): Don't use
unaligned vsx and avoid lxvd2x/stxvd2x.
(gen_lvx_v4si_move): New function.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 267299)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2669,6 +2669,25 @@
   return true;
 }

+/* Generate loads and stores for a move of v4si mode using lvx/stvx.
+   This uses altivec_{l,st}vx__internal which use unspecs to
+   keep combine from changing what instruction gets used.
+
+   DEST is the destination for the data.
+   SRC is the source of the data for the move.  */
+
+static rtx
+gen_lvx_v4si_move (rtx dest, rtx src)
+{
+  gcc_assert (MEM_P (dest) ^ MEM_P (src));
+  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
+
+  if (MEM_P (dest))
+return gen_altivec_stvx_v4si_internal (dest, src);
+  else
+return gen_altivec_lvx_v4si_internal (dest, src);
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.

@@ -2721,11 +2740,11 @@

   /* Altivec first, since it will be faster than a string move
 when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
+  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
{
  move_bytes = 16;
  mode = V4SImode;
- gen_func.mov = gen_movv4si;
+ gen_func.mov = gen_lvx_v4si_move;
}
   else if (bytes >= 8 && TARGET_POWERPC64
   && (align >= 64 || !STRICT_ALIGNMENT))



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2018-12-20 Thread Aaron Sawdey
On 12/20/18 5:44 PM, Segher Boessenkool wrote:
> On Thu, Dec 20, 2018 at 05:34:54PM -0600, Aaron Sawdey wrote:
>> On 12/20/18 3:51 AM, Segher Boessenkool wrote:
>>> On Wed, Dec 19, 2018 at 01:53:05PM -0600, Aaron Sawdey wrote:
>>>> Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
>>>> to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
>>>> inline expansion from doing unaligned vector or using vector load/store
>>>> other than lvx/stvx. More description of the issue is here:
>>>>
>>>> https://patchwork.ozlabs.org/patch/814059/
>>>>
>>>> OK for trunk if bootstrap/regtest ok?
>>>
>>> Okay, but see below.
>>>
>> [snip]
>>>
>>> This is extraordinarily clumsy :-)  Maybe something like:
>>>
>>> static rtx
>>> gen_lvx_v4si_move (rtx dest, rtx src)
>>> {
>>>   gcc_assert (!(MEM_P (dest) && MEM_P (src));
>>>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
>>>   if (MEM_P (dest))
>>> return gen_altivec_stvx_v4si_internal (dest, src);
>>>   else if (MEM_P (src))
>>> return gen_altivec_lvx_v4si_internal (dest, src);
>>>   else
>>> gcc_unreachable ();
>>> }
>>>
>>> (Or do you allow VOIDmode for src as well?)  Anyway, at least get rid of
>>> the useless extra variable.
>>
>> I think this should be better:
> 
> The gcc_unreachable at the end catches the non-mem to non-mem case.
> 
>> static rtx
>> gen_lvx_v4si_move (rtx dest, rtx src)
>> {
>>   gcc_assert ((MEM_P (dest) && !MEM_P (src)) || (MEM_P (src) && 
>> !MEM_P(dest)));
> 
> But if you prefer this, how about
> 
> {
>   gcc_assert (MEM_P (dest) ^ MEM_P (src));
>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
> 
>   if (MEM_P (dest))
> return gen_altivec_stvx_v4si_internal (dest, src);
>   else
> return gen_altivec_lvx_v4si_internal (dest, src);
> }
> 
> :-)
> 
> 
> Segher
> 

I like that even better, thanks!

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2018-12-20 Thread Aaron Sawdey
On 12/20/18 3:51 AM, Segher Boessenkool wrote:
> On Wed, Dec 19, 2018 at 01:53:05PM -0600, Aaron Sawdey wrote:
>> Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
>> to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
>> inline expansion from doing unaligned vector or using vector load/store
>> other than lvx/stvx. More description of the issue is here:
>>
>> https://patchwork.ozlabs.org/patch/814059/
>>
>> OK for trunk if bootstrap/regtest ok?
> 
> Okay, but see below.
> 
[snip]
> 
> This is extraordinarily clumsy :-)  Maybe something like:
> 
> static rtx
> gen_lvx_v4si_move (rtx dest, rtx src)
> {
>   gcc_assert (!(MEM_P (dest) && MEM_P (src));
>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
>   if (MEM_P (dest))
> return gen_altivec_stvx_v4si_internal (dest, src);
>   else if (MEM_P (src))
> return gen_altivec_lvx_v4si_internal (dest, src);
>   else
> gcc_unreachable ();
> }
> 
> (Or do you allow VOIDmode for src as well?)  Anyway, at least get rid of
> the useless extra variable.

I think this should be better:

static rtx
gen_lvx_v4si_move (rtx dest, rtx src)
{
  gcc_assert ((MEM_P (dest) && !MEM_P (src)) || (MEM_P (src) && !MEM_P(dest)));
  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
  if (MEM_P (dest))
  return gen_altivec_stvx_v4si_internal (dest, src);
  else if (MEM_P (src))
  return gen_altivec_lvx_v4si_internal (dest, src);
  gcc_unreachable ();
}

I'll commit after I re-regstrap.

Thanks!
   Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] -Wtautological-compare: fix comparison of macro expansions

2018-12-20 Thread Aaron Sawdey
On 12/20/18 8:25 AM, David Malcolm wrote:
> According to comments within PR c++/87504, the patch fixes the
> bootstrap on aarch64, and fixes a similar issue on Solaris/SPARC.
> 
> It also passed bootstrap on x86_64-pc-linux-gnu.
> 
> Given that, I've committed it to trunk as r267299.
> 
> Aaron, does this fix the issue you saw?
> 
> Thanks, and sorry again about the breakage.
> Dave
> 

Dave,
  Thanks for the quick response, the build issue is fixed with r267299.

  Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH 2/2] v2: C++: improvements to binary operator diagnostics (PR c++/87504)

2018-12-19 Thread Aaron Sawdey
; (build_binary_op): Use struct op_location_t and
>> class binary_op_rich_location.
>>
>> gcc/cp/ChangeLog:
>> PR c++/87504
>> * call.c (op_error): Convert 1st param from location_t to
>> const op_location_t &.  Use binary_op_rich_location for binary
>> ops.
>> (build_conditional_expr_1): Convert 1st param from location_t to
>> const op_location_t &.
>> (build_conditional_expr): Likewise.
>> (build_new_op_1): Likewise.
>> (build_new_op): Likewise.
>> * cp-tree.h (build_conditional_expr): Likewise.
>> (build_new_op): Likewise.
>> (build_x_binary_op): Likewise.
>> (cp_build_binary_op): Likewise.
>> * parser.c (cp_parser_primary_expression): Build a location
>> for id-expression nodes.
>> (cp_parser_binary_expression): Use an op_location_t when
>> calling build_x_binary_op.
>> (cp_parser_operator): Build a location for user-defined literals.
>> * typeck.c (build_x_binary_op): Convert 1st param from location_t
>> to const op_location_t &.
>> (cp_build_binary_op): Likewise.  Use binary_op_rich_location.
>>
>> gcc/ChangeLog:
>> PR c++/87504
>> * gcc-rich-location.c
>> (maybe_range_label_for_tree_type_mismatch::get_text): Move here from
>> c/c-typeck.c.
>> (binary_op_rich_location::binary_op_rich_location): New ctor.
>> (binary_op_rich_location::use_operator_loc_p): New function.
>> * gcc-rich-location.h
>> (class maybe_range_label_for_tree_type_mismatch)): Move here from
>> c/c-typeck.c.
>> (struct op_location_t): New forward decl.
>> (class binary_op_rich_location): New class.
>> * tree.h (struct op_location_t): New struct.
>>
>> gcc/testsuite/ChangeLog:
>> * c-c++-common/Wtautological-compare-ranges.c: New test.
>> * g++.dg/cpp0x/pr51420.C: Add -fdiagnostics-show-caret and update
>> expected output.
>> * g++.dg/diagnostic/bad-binary-ops.C: Update expected output from
>> 1-location form to 3-location form, with labelling of ranges with
>> types.  Add examples of id-expression nodes with namespaces.
>> * g++.dg/diagnostic/param-type-mismatch-2.C: Likewise.
>>
>> This is the 2nd commit message:
>>
>> FIXME: column and multiline fixes to * g++.dg/cpp0x/pr51420.C
>> ---
>>   gcc/c-family/c-common.h    |  3 +-
>>   gcc/c-family/c-warn.c  | 57 +++---
>>   gcc/c/c-typeck.c   | 41 +-
>>   gcc/cp/call.c  | 28 ---
>>   gcc/cp/cp-tree.h   | 10 ++-
>>   gcc/cp/parser.c    | 32 ++--
>>   gcc/cp/typeck.c    | 14 ++--
>>   gcc/gcc-rich-location.c    | 89 
>> ++
>>   gcc/gcc-rich-location.h    | 57 ++
>>   .../c-c++-common/Wtautological-compare-ranges.c    | 42 ++
>>   gcc/testsuite/g++.dg/cpp0x/pr51420.C   | 10 +++
>>   gcc/testsuite/g++.dg/diagnostic/bad-binary-ops.C   | 57 +-
>>   .../g++.dg/diagnostic/param-type-mismatch-2.C  |  4 +-
>>   gcc/tree.h | 49 
>>   14 files changed, 417 insertions(+), 76 deletions(-)
>>   create mode 100644 
>> gcc/testsuite/c-c++-common/Wtautological-compare-ranges.c
>>
>> diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
>> index 4187343..0b9ddf6 100644
>> --- a/gcc/c-family/c-common.h
>> +++ b/gcc/c-family/c-common.h
>> @@ -1268,7 +1268,8 @@ extern void constant_expression_error (tree);
>>   extern void overflow_warning (location_t, tree, tree = NULL_TREE);
>>   extern void warn_logical_operator (location_t, enum tree_code, tree,
>>  enum tree_code, tree, enum tree_code, tree);
>> -extern void warn_tautological_cmp (location_t, enum tree_code, tree, tree);
>> +extern void warn_tautological_cmp (const op_location_t &, enum tree_code,
>> +   tree, tree);
>>   extern void warn_logical_not_parentheses (location_t, enum tree_code, tree,
>>     tree);
>>   extern bool warn_if_unused_value (const_tree, location_t);
>> diff --git a/gcc/c-family/c-warn.c b/gcc/c-family/c-warn.c
>> index fc7f87c..fce9d84 100644
>> --- a/gcc/c-family/c-warn.c
>> +++ b/gcc/c-family/c-warn.c
>> @@ -322,7 +322,8 @@ find_array_ref_with_const_idx_r (tree *expr_p, int *, 
>> void *)
>>       if ((TREE_CODE (expr) == ARRAY_REF
>>  || TREE_CODE (expr) == ARRAY_RANGE_REF)
>> -  && TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
>> +  && (TREE_CODE (tree_strip_any_location_wrapper (TREE_OPERAND (expr, 
>> 1)))
>> +  == INTEGER_CST))
>>   return integer_type_node;
> 
> I think we want fold_for_warn here.  OK with that change (assuming it passes).
> 
> Jason
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2018-12-19 Thread Aaron Sawdey
Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
inline expansion from doing unaligned vector or using vector load/store
other than lvx/stvx. More description of the issue is here:

https://patchwork.ozlabs.org/patch/814059/

OK for trunk if bootstrap/regtest ok?

Thanks!
   Aaron

2018-12-19  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_move): Don't use
unaligned vsx and avoid lxvd2x/stxvd2x.
(gen_lvx_v4si_move): New function.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 267055)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2669,6 +2669,35 @@
   return true;
 }

+/* Generate loads and stores for a move of v4si mode using lvx/stvx.
+   This uses altivec_{l,st}vx__internal which use unspecs to
+   keep combine from changing what instruction gets used.
+
+   DEST is the destination for the data.
+   SRC is the source of the data for the move.  */
+
+static rtx
+gen_lvx_v4si_move (rtx dest, rtx src)
+{
+  rtx rv = NULL;
+  if (MEM_P (dest))
+{
+  gcc_assert (!MEM_P (src));
+  gcc_assert (GET_MODE (src) == V4SImode);
+  rv = gen_altivec_stvx_v4si_internal (dest, src);
+}
+  else if (MEM_P (src))
+{
+  gcc_assert (!MEM_P (dest));
+  gcc_assert (GET_MODE (dest) == V4SImode);
+  rv = gen_altivec_lvx_v4si_internal (dest, src);
+}
+  else
+gcc_unreachable ();
+
+  return rv;
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.

@@ -2721,11 +2750,11 @@

   /* Altivec first, since it will be faster than a string move
 when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
+  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
{
  move_bytes = 16;
  mode = V4SImode;
- gen_func.mov = gen_movv4si;
+ gen_func.mov = gen_lvx_v4si_move;
}
   else if (bytes >= 8 && TARGET_POWERPC64
       && (align >= 64 || !STRICT_ALIGNMENT))



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] better use of unaligned vsx in memset() expansion

2018-11-28 Thread Aaron Sawdey
The first version of this had a big bug and cleared past the requested bytes.
This version passes regstrap on ppc64le(power7/8/9), ppc64be(power6/7/8),
and ppc32(power8).

OK for trunk (and 8 backport after a week)?

Thanks!
   Aaron

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 266524)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -85,6 +85,8 @@
   if (! optimize_size && bytes > 8 * clear_step)
 return 0;

+  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
+
   for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
 {
   machine_mode mode = BLKmode;
@@ -91,8 +93,7 @@
   rtx dest;

   if (TARGET_ALTIVEC
- && ((bytes >= 16 && align >= 128)
- || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
+ && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
{
  clear_bytes = 16;
  mode = V4SImode;


On 11/26/18 4:29 PM, Segher Boessenkool wrote:
> On Mon, Nov 26, 2018 at 03:08:32PM -0600, Aaron Sawdey wrote:
>> When I previously added the use of unaligned vsx stores to inline expansion
>> of memset, I didn't do a good job of managing boundary conditions. The 
>> intention
>> was to only use unaligned vsx if the block being cleared was more than 32 
>> bytes.
>> What it actually did was to prevent the use of unaligned vsx for the last 32
>> bytes of any block being cleared. So this change puts the test up front so it
>> is not affected by the decrement of bytes.
> 
> Oh wow.  Yes, that isn't so great.  Okay for trunk (and whatever backports).
> Thanks,
> 
> 
> Segher
> 
> 
>> 2018-11-26  Aaron Sawdey  
>>
>>  * config/rs6000/rs6000-string.c (expand_block_clear): Change how
>>  we determine if unaligned vsx is ok.
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000][8 backport] improve gpr inline expansion of str[n]cmp

2018-11-26 Thread Aaron Sawdey
Just so there is some record of what I did here -- in order to backport the
gpr strncmp expansion improvement patch to gcc 8 I had to pull in some pieces
of an earlier cleanup patch from June of this year.

I'll get this committed to gcc-8-branch when I'm done with the bootstrap/regtest
on a couple different ppc64 architectures (unless anyone has any objections).

Thanks,
   Aaron



2018-11-26  Aaron Sawdey  

Backport from mainline
2018-10-25  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Change to
a shorter sequence with fewer branches.
(emit_final_str_compare_gpr): Ditto.

Backport from mainline to allow the above code to go in:
2018-06-14  Aaron Sawdey  

* config/rs6000/rs6000-string.c (do_and3, do_and3_mask,


Index: rs6000-string.c
===
--- rs6000-string.c (revision 266483)
+++ rs6000-string.c (working copy)
@@ -408,6 +408,54 @@
 emit_insn (gen_addsi3 (dest, src1, src2));
 }

+/* Emit an and of the proper mode for DEST.
+
+   DEST is the destination register for the and.
+   SRC1 is the first and input.
+   SRC2 is the second and input.
+
+   Computes DEST = SRC1 & SRC2.  */
+static void
+do_and3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_anddi3 (dest, src1, src2));
+  else
+emit_insn (gen_andsi3 (dest, src1, src2));
+}
+
+/* Emit a cmpb of the proper mode for DEST.
+
+   DEST is the destination register for the cmpb.
+   SRC1 is the first input.
+   SRC2 is the second input.
+
+   Computes cmpb of SRC1, SRC2.  */
+static void
+do_cmpb3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_cmpbdi3 (dest, src1, src2));
+  else
+emit_insn (gen_cmpbsi3 (dest, src1, src2));
+}
+
+/* Emit a rotl of the proper mode for DEST.
+
+   DEST is the destination register for the rotate.
+   SRC1 is the value to rotate.
+   SRC2 is the rotate count.
+
+   Computes DEST = SRC1 rotated left by SRC2.  */
+static void
+do_rotl3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_rotldi3 (dest, src1, src2));
+  else
+emit_insn (gen_rotlsi3 (dest, src1, src2));
+}
+
 /* Generate rtl for a load, shift, and compare of less than a full word.

LOAD_MODE is the machine mode for the loads.
@@ -640,7 +688,7 @@
 {
   if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
/* Do not expect length longer than word_mode.  */
-   return false;
+   return false;
   else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE 
(word_mode))
{
  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
@@ -684,7 +732,7 @@
   rtx j;

   /* Example of generated code for 35 bytes aligned 1 byte.
-
+
 mtctr 8
 li 6,0
 li 5,8
@@ -712,7 +760,7 @@
 popcntd 9,9
 subfe 10,10,10
 or 9,9,10
-
+
  Compiled with -fno-reorder-blocks for clarity.  */

   /* Structure of what we're going to do:
@@ -955,7 +1003,7 @@
   if (!bytes_is_const)
{
  /* If we're dealing with runtime length, we have to check if
-it's zero after the loop. When length is known at compile
+it's zero after the loop.  When length is known at compile
 time the no-remainder condition is dealt with above.  By
 doing this after cleanup_label, we also deal with the
 case where length is 0 at the start and we bypass the
@@ -1325,7 +1373,7 @@
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
   /* P7/P8 code uses cond for subfc. but P9 uses
- it for cmpld which needs CCUNSmode. */
+ it for cmpld which needs CCUNSmode.  */
   rtx cond;
   if (TARGET_P9_MISC)
 cond = gen_reg_rtx (CCUNSmode);
@@ -1578,7 +1626,7 @@
emit_label (convert_label);

   /* We need to produce DI result from sub, then convert to target SI
-while maintaining <0 / ==0 / >0 properties. This sequence works:
+while maintaining <0 / ==0 / >0 properties.  This sequence works:
 subfc L,A,B
 subfe H,H,H
 popcntd L,L
@@ -1847,6 +1895,9 @@
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);

+  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
+  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
+
   /* Generate sequence of ld/ldbrx, cmpb to compare out
  to the length specified.  */
   unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
@@ -1853,12 +1904,9 @@
   while (bytes_to_compare > 0)
 {
   /* Compare sequence:
- check each 8B with: ld/ld cmpd bne
-If equal, use rldicr/cmpb to check for zero byte.
+ check each 8B with: ld/ld/cmpb/cmpb/orc./bne
+
  cleanup code at end:
- c

[PATCH][rs6000] better use of unaligned vsx in memset() expansion

2018-11-26 Thread Aaron Sawdey
When I previously added the use of unaligned vsx stores to inline expansion
of memset, I didn't do a good job of managing boundary conditions. The intention
was to only use unaligned vsx if the block being cleared was more than 32 bytes.
What it actually did was to prevent the use of unaligned vsx for the last 32
bytes of any block being cleared. So this change puts the test up front so it
is not affected by the decrement of bytes.
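The effect is easy to see in a standalone toy program (plain C, not the
GCC code): testing the remaining byte count inside the loop turns the
unaligned path off for the tail of every large block, while the hoisted
test decides once from the total size:

#include <stdio.h>

int
main (void)
{
  unsigned total = 48;
  int hoisted_ok = (total >= 32);      /* fixed: decided once from total */

  for (unsigned bytes = total, offset = 0; bytes > 0;
       offset += 16, bytes -= 16)
    {
      int in_loop_ok = (bytes >= 32);  /* buggy: false once bytes < 32 */
      printf ("offset %2u: in-loop test %d, hoisted test %d\n",
              offset, in_loop_ok, hoisted_ok);
    }
  return 0;
}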

OK for trunk if regstrap passes?

Thanks!
   Aaron



2018-11-26  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_clear): Change how
we determine if unaligned vsx is ok.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 266219)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -85,14 +85,14 @@
   if (! optimize_size && bytes > 8 * clear_step)
 return 0;

+  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
+
   for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
 {
   machine_mode mode = BLKmode;
   rtx dest;

-  if (TARGET_ALTIVEC
- && ((bytes >= 16 && align >= 128)
- || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
+  if (TARGET_ALTIVEC && ((bytes >= 16 && align >= 128) || unaligned_vsx_ok))
    {
  clear_bytes = 16;
  mode = V4SImode;

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] inline expansion of memcmp using vsx

2018-11-15 Thread Aaron Sawdey
On 11/15/18 4:02 AM, Richard Biener wrote:
> On Wed, Nov 14, 2018 at 5:43 PM Aaron Sawdey  wrote:
>>
>> This patch generalizes some of the functions added earlier to do vsx
>> expansion of strncmp so that they can also generate the code needed for
>> memcmp. I reorganized expand_block_compare() a little to be able to make
>> use of this there. The vsx code is more compact, so I've changed the
>> default block compare inline limit to 63 bytes. The vsx code is only used
>> if there are at least 16 bytes to compare, as this means we don't have to
>> generate complex code to compare less than one chunk. If vsx is not
>> available, the limit is cut in half. The performance is good: vsx memcmp
>> is considerably faster than the gpr inline code if the strings are equal,
>> and is comparable if the strings have a 10% chance of being equal (spread
>> across the string).
> 
> How is performance affected if there are close earlier char-size
> stores to one of the string/memory?
> Can power still do store forwarding in this case?

Store forwarding between scalar and vector is not great, but it's
better than having to make a plt call to memcmp() which may well use
vsx anyway. I had set the crossover between scalar and vsx at 16 bytes
because the vsx code is more compact. The performance is similar for
16-32 byte sizes. But you could make an argument for switching at 33
bytes. This way builtin memcmp of 33-64 bytes would now use inline vsx
code instead of a memcmp() call. At 33 bytes the vsx inline code is 3x
faster than a memcmp() call, so it would likely remain faster even if
there was an ugly vector-load-hit-scalar-store. Also, small structures
of 32 bytes and less being compared would use scalar code, the same as
gcc 8, and would avoid this issue.

  Aaron

> 
>> Currently regtesting, ok for trunk if tests pass?
>>
>> Thanks!
>>Aaron
>>
>> 2018-11-14  Aaron Sawdey  
>>
>> * config/rs6000/rs6000-string.c (emit_vsx_zero_reg): New function.
>> (expand_cmp_vec_sequence): Rename and modify
>> expand_strncmp_vec_sequence.
>> (emit_final_compare_vec): Rename and modify 
>> emit_final_str_compare_vec.
>> (generate_6432_conversion): New function.
>> (expand_block_compare): Add support for vsx.
>> (expand_block_compare_gpr): New function.
>> * config/rs6000/rs6000.opt (rs6000_block_compare_inline_limit): 
>> Increase
>> default limit to 63 because of more compact vsx code.
>>
>>
>>
>>
>> Index: gcc/config/rs6000/rs6000-string.c
>> ===
>> --- gcc/config/rs6000/rs6000-string.c   (revision 266034)
>> +++ gcc/config/rs6000/rs6000-string.c   (working copy)
>> @@ -615,6 +615,283 @@
>>  }
>>  }
>>
>> +static rtx
>> +emit_vsx_zero_reg()
>> +{
>> +  unsigned int i;
>> +  rtx zr[16];
>> +  for (i = 0; i < 16; i++)
>> +zr[i] = GEN_INT (0);
>> +  rtvec zv = gen_rtvec_v (16, zr);
>> +  rtx zero_reg = gen_reg_rtx (V16QImode);
>> +  rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv));
>> +  return zero_reg;
>> +}
>> +
>> +/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
>> +   instructions.
>> +
>> +   BYTES_TO_COMPARE is the number of bytes to be compared.
>> +   ORIG_SRC1 is the unmodified rtx for the first string.
>> +   ORIG_SRC2 is the unmodified rtx for the second string.
>> +   S1ADDR is the register to use for the base address of the first string.
>> +   S2ADDR is the register to use for the base address of the second string.
>> +   OFF_REG is the register to use for the string offset for loads.
>> +   S1DATA is the register for loading the first string.
>> +   S2DATA is the register for loading the second string.
>> +   VEC_RESULT is the rtx for the vector result indicating the byte 
>> difference.
>> +   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup 
>> call
>> +   to strcmp/strncmp if we have equality at the end of the inline 
>> comparison.
>> +   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need 
>> code
>> +   to clean up and generate the final comparison result.
>> +   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
>> +   set the final result.
>> +   CHECKZERO indicates whether the sequence should check for zero bytes
>> +   for use doing strncmp, or not (for use doing memcmp).  */
>> +static void
>> +expand_cmp_vec_sequence (unsigned HOST_WIDE_I

[PATCH][rs6000] inline expansion of memcmp using vsx

2018-11-14 Thread Aaron Sawdey
This patch generalizes some of the functions added earlier to do vsx
expansion of strncmp so that they can also generate the code needed for
memcmp. I reorganized expand_block_compare() a little to be able to make
use of this there. The vsx code is more compact so I've changed the
default block compare inline limit to 63 bytes. The vsx code is only
used if there are at least 16 bytes to compare, as this means we don't
have to emit complex code to compare less than one chunk. If vsx is not
available the limit is cut in half. The performance is good: vsx memcmp
is considerably faster than the gpr inline code if the strings are
equal, and is comparable if the strings have a 10% chance of being
equal (spread across the string).

Currently regtesting, ok for trunk if tests pass?

Thanks!
   Aaron

2018-11-14  Aaron Sawdey  

* config/rs6000/rs6000-string.c (emit_vsx_zero_reg): New function.
(expand_cmp_vec_sequence): Rename and modify
expand_strncmp_vec_sequence.
(emit_final_compare_vec): Rename and modify emit_final_str_compare_vec.
(generate_6432_conversion): New function.
(expand_block_compare): Add support for vsx.
(expand_block_compare_gpr): New function.
* config/rs6000/rs6000.opt (rs6000_block_compare_inline_limit): Increase
default limit to 63 because of more compact vsx code.




Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 266034)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -615,6 +615,283 @@
 }
 }

+static rtx
+emit_vsx_zero_reg()
+{
+  unsigned int i;
+  rtx zr[16];
+  for (i = 0; i < 16; i++)
+zr[i] = GEN_INT (0);
+  rtvec zv = gen_rtvec_v (16, zr);
+  rtx zero_reg = gen_reg_rtx (V16QImode);
+  rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv));
+  return zero_reg;
+}
+
+/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
+   instructions.
+
+   BYTES_TO_COMPARE is the number of bytes to be compared.
+   ORIG_SRC1 is the unmodified rtx for the first string.
+   ORIG_SRC2 is the unmodified rtx for the second string.
+   S1ADDR is the register to use for the base address of the first string.
+   S2ADDR is the register to use for the base address of the second string.
+   OFF_REG is the register to use for the string offset for loads.
+   S1DATA is the register for loading the first string.
+   S2DATA is the register for loading the second string.
+   VEC_RESULT is the rtx for the vector result indicating the byte difference.
+   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
+   to strcmp/strncmp if we have equality at the end of the inline comparison.
+   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
+   to clean up and generate the final comparison result.
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
+   set the final result.
+   CHECKZERO indicates whether the sequence should check for zero bytes
+   for use doing strncmp, or not (for use doing memcmp).  */
+static void
+expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
+rtx orig_src1, rtx orig_src2,
+rtx s1addr, rtx s2addr, rtx off_reg,
+rtx s1data, rtx s2data, rtx vec_result,
+bool equality_compare_rest, rtx *p_cleanup_label,
+rtx final_move_label, bool checkzero)
+{
+  machine_mode load_mode;
+  unsigned int load_mode_size;
+  unsigned HOST_WIDE_INT cmp_bytes = 0;
+  unsigned HOST_WIDE_INT offset = 0;
+  rtx zero_reg = NULL;
+
+  gcc_assert (p_cleanup_label != NULL);
+  rtx cleanup_label = *p_cleanup_label;
+
+  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
+  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));
+
+  if (checkzero && !TARGET_P9_VECTOR)
+zero_reg = emit_vsx_zero_reg();
+
+  while (bytes_to_compare > 0)
+{
+  /* VEC/VSX compare sequence for P8:
+check each 16B with:
+lxvd2x 32,28,8
+lxvd2x 33,29,8
+vcmpequb 2,0,1  # compare strings
+vcmpequb 4,0,3  # compare w/ 0
+xxlorc 37,36,34   # first FF byte is either mismatch or end of string
+vcmpequb. 7,5,3  # reg 7 contains 0
+bnl 6,.Lmismatch
+
+For the P8 LE case, we use lxvd2x and compare full 16 bytes
+but then use vgbbd and a shift to get two bytes with the
+information we need in the correct order.
+
+VEC/VSX compare sequence if TARGET_P9_VECTOR:
+lxvb16x/lxvb16x # load 16B of each string
+vcmpnezb.   # produces difference location or zero byte location
+bne 6,.Lmismatch
+
+Use the overlapping compare trick for the last block if it is
+less than 16 bytes.
+  */
+
+  load_mode = V16QImode;
+  

[PATCH][rs6000] use index form addresses more often for l[wh]brx/st[wh]brx

2018-11-05 Thread Aaron Sawdey
This does the same thing for bswap<mode>2 that I previously did for bswapdi2.
The predicates for bswap<mode>2_{load,store} are now
indexed_or_indirect_operand, and bswap<mode>2 uses
rs6000_force_indexed_or_indirect_mem to make sure the
address is appropriate for that predicate.

Bootstrap/regtest passes on ppc64le power8/power9, ok for trunk?

Thanks!
Aaron



2018-11-05  Aaron Sawdey  

* config/rs6000/rs6000.md (bswap<mode>2): Force address into register
if not in indexed or indirect form.
(bswap<mode>2_load): Change predicate to indexed_or_indirect_operand.
(bswap<mode>2_store): Ditto.


Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265753)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2411,9 +2411,15 @@
 src = force_reg (<MODE>mode, src);
 
   if (MEM_P (src))
-emit_insn (gen_bswap<mode>2_load (dest, src));
+{
+   src = rs6000_force_indexed_or_indirect_mem (src);
+   emit_insn (gen_bswap<mode>2_load (dest, src));
+}
   else if (MEM_P (dest))
-emit_insn (gen_bswap<mode>2_store (dest, src));
+{
+   dest = rs6000_force_indexed_or_indirect_mem (dest);
+   emit_insn (gen_bswap<mode>2_store (dest, src));
+}
   else
 emit_insn (gen_bswap<mode>2_reg (dest, src));
   DONE;
@@ -2421,13 +2427,13 @@
 
 (define_insn "bswap<mode>2_load"
   [(set (match_operand:HSI 0 "gpc_reg_operand" "=r")
-   (bswap:HSI (match_operand:HSI 1 "memory_operand" "Z")))]
+   (bswap:HSI (match_operand:HSI 1 "indexed_or_indirect_operand" "Z")))]
   ""
   "l<wd>brx %0,%y1"
   [(set_attr "type" "load")])
 
 (define_insn "bswap<mode>2_store"
-  [(set (match_operand:HSI 0 "memory_operand" "=Z")
+  [(set (match_operand:HSI 0 "indexed_or_indirect_operand" "=Z")
   (bswap:HSI (match_operand:HSI 1 "gpc_reg_operand" "r")))]
   ""
   "st<wd>brx %1,%y0"




-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] fix ICE for strncmp expansion on power6

2018-11-02 Thread Aaron Sawdey
This patch addresses an ICE for a missing instruction when targeting power6.
The issue is that we shouldn't generate x-form load rtx if TARGET_AVOID_XFORM
is true because it won't end up being matched. More generally, on big endian
we do not need to use ldbrx et al., which are indexed loads, but can just use
ld and other normal d-form loads. So this is going to generate better code
for BE in general, which is why I have changed it to do this for big endian
or TARGET_AVOID_XFORM.

Bootstrap/regtest passes on ppc32 and ppc64 (power 6/7/8), ok for trunk?

Thanks!
   Aaron


2018-11-02  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Pay
attention to TARGET_AVOID_XFORM.

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 265733)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -1798,12 +1798,18 @@
   rid of the extra bytes.  */
cmp_bytes = bytes_to_compare;

-  rtx offset_reg = gen_reg_rtx (Pmode);
-  emit_move_insn (offset_reg, GEN_INT (offset));
-
-  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_reg);
+  rtx offset_rtx;
+  if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
+   offset_rtx = GEN_INT (offset);
+  else
+   {
+ offset_rtx = gen_reg_rtx (Pmode);
+ emit_move_insn (offset_rtx, GEN_INT (offset));
+   }
+  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
+  rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
+   
   do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, 
orig_src1);
-  rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_reg);
   do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, 
orig_src2);

   /* We must always left-align the data we read, and


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] cleanup and rename rs6000_address_for_fpconvert

2018-11-01 Thread Aaron Sawdey
This patch combines the duties of rs6000_address_for_fpconvert into
rs6000_force_indexed_or_indirect_mem, which I recently added, and changes
all calls to use the latter. The new function name is more descriptive of
what is actually going on. This now uses indexed_or_indirect_operand() to
test the incoming rtx, which matches the predicate used by the insns this
function prepares operands for.
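
The intended contract, as a minimal sketch (illustration only, not code
from the patch):

/* On return the MEM satisfies the same predicate the consuming insns
   use, i.e. its address is a single reg or reg+reg.  */
rtx mem2 = rs6000_force_indexed_or_indirect_mem (mem);
gcc_assert (indexed_or_indirect_operand (mem2, GET_MODE (mem2)));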

Bootstrap/regtest passes on ppc64le (power7, power9), ok for trunk?



2018-11-01  Aaron Sawdey  

* config/rs6000/rs6000-protos.h (rs6000_address_for_fpconvert): Remove
prototype.
* config/rs6000/rs6000.c (rs6000_force_indexed_or_indirect_mem):
Combine with rs6000_address_for_fpconvert.
(rs6000_address_for_fpconvert): Combine with
rs6000_force_indexed_or_indirect_mem.
(rs6000_expand_vector_init): Change function call from
rs6000_address_for_fpconvert to rs6000_force_indexed_or_indirect_mem.
* config/rs6000/rs6000.md (floatsi<mode>2_lfiwax): Change call from
rs6000_address_for_fpconvert to rs6000_force_indexed_or_indirect_mem.
(floatsi<mode>2_lfiwax_mem): Ditto.
(floatunssi<mode>2_lfiwzx): Ditto.
(floatunssi<mode>2_lfiwzx_mem): Ditto.
(float<QHI:mode><FP_ISA3:mode>2): Ditto.
(floatuns<QHI:mode><FP_ISA3:mode>2): Ditto.
(fix_trunc<mode>si2_stfiwx): Ditto.
(fixuns_trunc<mode>si2_stfiwx): Ditto.
(float_<mode>si2_hw): Ditto.
(floatuns_<mode>si2_hw): Ditto.
* config/rs6000/vsx.md (*vsx_extract_si): Ditto.
(vsx_splat_<mode>): Ditto.




Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 265637)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -153,7 +153,6 @@

 extern rtx rs6000_machopic_legitimize_pic_address (rtx, machine_mode,
   rtx);
-extern rtx rs6000_address_for_fpconvert (rtx);
 extern rtx rs6000_allocate_stack_temp (machine_mode, bool, bool);
 extern align_flags rs6000_loop_align (rtx);
 extern void rs6000_split_logical (rtx [], enum rtx_code, bool, bool, bool);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 265637)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -6560,7 +6560,7 @@
{
  rtx element0 = XVECEXP (vals, 0, 0);
  if (MEM_P (element0))
-   element0 = rs6000_address_for_fpconvert (element0);
+   element0 = rs6000_force_indexed_or_indirect_mem (element0);
  else
element0 = force_reg (SImode, element0);

@@ -6601,7 +6601,7 @@
  if (TARGET_P9_VECTOR)
{
  if (MEM_P (element0))
-   element0 = rs6000_address_for_fpconvert (element0);
+   element0 = rs6000_force_indexed_or_indirect_mem (element0);

  emit_insn (gen_vsx_splat_v4sf (target, element0));
}
@@ -8423,23 +8423,6 @@
   return false;
 }

-/* Helper function for making sure we will make full
-   use of indexed addressing.  */
-
-rtx
-rs6000_force_indexed_or_indirect_mem (rtx x)
-{
-  machine_mode m = GET_MODE (x);
-  if (!indexed_or_indirect_operand (x, m))
-{
-  rtx addr = XEXP (x, 0);
-  addr = force_reg (Pmode, addr);
-  x = replace_equiv_address_nv (x, addr);
-}
-  return x;
-}
-
-
 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */

 static bool
@@ -37312,21 +37295,19 @@
   return stack;
 }

-/* Given a memory reference, if it is not a reg or reg+reg addressing, convert
-   to such a form to deal with memory reference instructions like STFIWX that
-   only take reg+reg addressing.  */
+/* Given a memory reference, if it is not a reg or reg+reg addressing,
+   convert to such a form to deal with memory reference instructions
+   like STFIWX and LDBRX that only take reg+reg addressing.  */

 rtx
-rs6000_address_for_fpconvert (rtx x)
+rs6000_force_indexed_or_indirect_mem (rtx x)
 {
-  rtx addr;
+  machine_mode m = GET_MODE (x);

   gcc_assert (MEM_P (x));
-  addr = XEXP (x, 0);
-  if (can_create_pseudo_p ()
-  && ! legitimate_indirect_address_p (addr, reload_completed)
-  && ! legitimate_indexed_address_p (addr, reload_completed))
+  if (can_create_pseudo_p () && !indexed_or_indirect_operand (x, m))
 {
+  rtx addr = XEXP (x, 0);
   if (GET_CODE (addr) == PRE_INC || GET_CODE (addr) == PRE_DEC)
{
  rtx reg = XEXP (addr, 0);
@@ -37346,7 +37327,7 @@
  addr = reg;
}

-  x = replace_equiv_address (x, copy_addr_to_reg (addr));
+  x = replace_equiv_address (x, force_reg (Pmode, addr));
 }

   return x;
Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265637)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -5225,7 +5225,7 @@
tmp = gen_reg_rtx (DImode);
   if (MEM_P (src))

Re: [PATCH][rs6000] use index form addresses more often for ldbrx/stdbrx

2018-10-30 Thread Aaron Sawdey
I had to make one more change to make this actually work. In
rs6000_force_indexed_or_indirect_mem() it was necessary to
return the updated rtx.
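
The reason, as a minimal sketch (hypothetical function, for illustration):
rtx is a pointer passed by value, and replace_equiv_address_nv builds a
new MEM rather than mutating the old one, so the void variant loses the
legitimized MEM:

void
broken_force_indexed (rtx x)   /* hypothetical void variant */
{
  rtx addr = force_reg (Pmode, XEXP (x, 0));
  x = replace_equiv_address_nv (x, addr);   /* rebinds only the local;
                                               the caller's MEM is
                                               unchanged */
}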

Bootstrap/regtest passes on ppc64le (power7, power9), ok for trunk?

Thanks!
   Aaron

2018-10-30  Aaron Sawdey  

* config/rs6000/rs6000.md (bswapdi2): Force address into register
if not in indexed or indirect form.
(bswapdi2_load): Change predicate to indexed_or_indirect_operand.
(bswapdi2_store): Ditto.
* config/rs6000/rs6000.c (rs6000_force_indexed_or_indirect_mem): New
helper function.
* config/rs6000/rs6000-protos.h (rs6000_force_indexed_or_indirect_mem):
Prototype for helper function.


Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 265588)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -47,6 +47,7 @@
 extern bool legitimate_indirect_address_p (rtx, int);
 extern bool legitimate_indexed_address_p (rtx, int);
 extern bool avoiding_indexed_address_p (machine_mode);
+extern rtx rs6000_force_indexed_or_indirect_mem (rtx x);

 extern rtx rs6000_got_register (rtx);
 extern rtx find_addr_reg (rtx);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 265588)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -8423,7 +8423,23 @@
   return false;
 }

+/* Helper function for making sure we will make full
+   use of indexed addressing.  */

+rtx
+rs6000_force_indexed_or_indirect_mem (rtx x)
+{
+  machine_mode m = GET_MODE (x);
+  if (!indexed_or_indirect_operand (x, m))
+{
+  rtx addr = XEXP (x, 0);
+  addr = force_reg (Pmode, addr);
+  x = replace_equiv_address_nv (x, addr);
+}
+  return x;
+}
+
+
 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */

 static bool
Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265588)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2512,9 +2512,15 @@
   if (TARGET_POWERPC64 && TARGET_LDBRX)
 {
   if (MEM_P (src))
-   emit_insn (gen_bswapdi2_load (dest, src));
+{
+ src = rs6000_force_indexed_or_indirect_mem (src);
+ emit_insn (gen_bswapdi2_load (dest, src));
+}
   else if (MEM_P (dest))
-   emit_insn (gen_bswapdi2_store (dest, src));
+{
+ dest = rs6000_force_indexed_or_indirect_mem (dest);
+ emit_insn (gen_bswapdi2_store (dest, src));
+}
   else if (TARGET_P9_VECTOR)
emit_insn (gen_bswapdi2_xxbrd (dest, src));
   else
@@ -2535,13 +2541,13 @@
 ;; Power7/cell has ldbrx/stdbrx, so use it directly
 (define_insn "bswapdi2_load"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-   (bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+   (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "Z")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "ldbrx %0,%y1"
   [(set_attr "type" "load")])

 (define_insn "bswapdi2_store"
-  [(set (match_operand:DI 0 "memory_operand" "=Z")
+  [(set (match_operand:DI 0 "indexed_or_indirect_operand" "=Z")
(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "stdbrx %1,%y0"



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] use index form addresses more often for ldbrx/stdbrx

2018-10-29 Thread Aaron Sawdey
On 10/27/18 12:52 PM, Segher Boessenkool wrote:
> Hi Aaron,
> 
> On Sat, Oct 27, 2018 at 11:20:01AM -0500, Aaron Sawdey wrote:
>> --- gcc/config/rs6000/rs6000.md  (revision 265393)
>> +++ gcc/config/rs6000/rs6000.md  (working copy)
>> @@ -2512,9 +2512,27 @@
>>if (TARGET_POWERPC64 && TARGET_LDBRX)
>>  {
>>if (MEM_P (src))
>> -emit_insn (gen_bswapdi2_load (dest, src));
>> +{
>> +  rtx addr = XEXP (src, 0);
>> +  if (!legitimate_indirect_address_p (addr, reload_completed)
>> +  && !legitimate_indexed_address_p (addr, reload_completed))
> 
> Should you use indexed_or_indirect_operand instead here?
> 
>> +{
>> +  addr = force_reg (Pmode, addr);
>> +  src = replace_equiv_address_nv (src, addr);
>> +}
>> +  emit_insn (gen_bswapdi2_load (dest, src));
>> +}
> 
> You could maybe make this a utility routine as well (in rs6000.c)...
> Something like force_indexed_or_indirect_mem.  So this code will be just
> 
>   if (MEM_P (src))
>   force_indexed_or_indirect_mem (src);
> 
> then.
> 
> Could you try those things please?
> 
> 
> Segher
> 

Segher,
  Here's a patch restructured in that way.
OK for trunk if bootstrap/regtest passes?

Thanks!
   Aaron

2018-10-29  Aaron Sawdey  

* config/rs6000/rs6000.md (bswapdi2): Force address into register
if not in one already.
(bswapdi2_load): Change predicate to indexed_or_indirect_operand.
(bswapdi2_store): Ditto.
* config/rs6000/rs6000.c (rs6000_force_indexed_or_indirect_mem): New
helper function.
* config/rs6000/rs6000-protos.h (rs6000_force_indexed_or_indirect_mem):
Prototype for helper function.



Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 265588)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -47,6 +47,7 @@
 extern bool legitimate_indirect_address_p (rtx, int);
 extern bool legitimate_indexed_address_p (rtx, int);
 extern bool avoiding_indexed_address_p (machine_mode);
+extern void rs6000_force_indexed_or_indirect_mem (rtx x);

 extern rtx rs6000_got_register (rtx);
 extern rtx find_addr_reg (rtx);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 265588)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -8423,7 +8423,22 @@
   return false;
 }

+/* Helper function for making sure we will make full
+   use of indexed addressing.  */

+void
+rs6000_force_indexed_or_indirect_mem (rtx x)
+{
+  rtx addr = XEXP (x, 0);
+  machine_mode m = GET_MODE (x);
+  if (!indexed_or_indirect_operand (x, m))
+{
+  addr = force_reg (Pmode, addr);
+  x = replace_equiv_address_nv (x, addr);
+}
+}
+
+
 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */

 static bool
Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265588)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2512,9 +2512,15 @@
   if (TARGET_POWERPC64 && TARGET_LDBRX)
 {
   if (MEM_P (src))
-   emit_insn (gen_bswapdi2_load (dest, src));
+{
+ rs6000_force_indexed_or_indirect_mem (src);
+ emit_insn (gen_bswapdi2_load (dest, src));
+}
   else if (MEM_P (dest))
-   emit_insn (gen_bswapdi2_store (dest, src));
+{
+ rs6000_force_indexed_or_indirect_mem (dest);
+ emit_insn (gen_bswapdi2_store (dest, src));
+}
   else if (TARGET_P9_VECTOR)
emit_insn (gen_bswapdi2_xxbrd (dest, src));
   else
@@ -2535,13 +2541,13 @@
 ;; Power7/cell has ldbrx/stdbrx, so use it directly
 (define_insn "bswapdi2_load"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-   (bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+   (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "Z")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "ldbrx %0,%y1"
   [(set_attr "type" "load")])

 (define_insn "bswapdi2_store"
-  [(set (match_operand:DI 0 "memory_operand" "=Z")
+  [(set (match_operand:DI 0 "indexed_or_indirect_operand" "=Z")
(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "stdbrx %1,%y0"



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] use index form addresses more often for ldbrx/stdbrx

2018-10-27 Thread Aaron Sawdey
At Segher's suggestion, I looked into changing the predicates on
bswapdi2_{load,store} from memory_operand to indexed_or_indirect_operand
and putting some code into bswapdi2 to make the address indexed or
indirect if it wasn't already.

The motivating case for this was the code I was seeing for the gpr expansion of 
strncmp.
Before I would typically see something like this:

addi 9,3,8
ldbrx 10,0,9
addi 9,4,8
ldbrx 8,0,9
subf. 9,8,10
bne 0,.L13
cmpb 10,10,9
cmpdi 0,10,0
bne 0,.L9
addi 9,3,16
ldbrx 10,0,9
addi 9,4,16
ldbrx 8,0,9
subf. 9,8,10
bne 0,.L13
cmpb 10,10,9
cmpdi 0,10,0
bne 0,.L9

For each comparison block it is doing the add separately and using 0 for
one input of the ldbrx.

After this change, it is more like this:

ldbrx 8,3,27
ldbrx 7,4,27
cmpb 9,8,9
cmpb 10,8,7
orc. 9,9,10
bne 0,.L13
ldbrx 8,3,24
ldbrx 7,4,24
cmpb 10,8,9
cmpb 9,8,7
orc. 9,10,9
bne 0,.L13


Here it has created temps with constants and hoisted them out of a loop,
but I have other cases where it will update them if there is more register
pressure. In either case the code is more compact and makes full use of
the indexed addressing of ldbrx.
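
A small driver of the sort that produces code like the above (hypothetical
example, not the exact benchmark):

#include <string.h>

/* With inline expansion enabled, each 8-byte chunk of this compare
   becomes an ldbrx/ldbrx/cmpb/cmpb/orc. group as shown above.  */
int cmp24 (const char *a, const char *b)
{
  return strncmp (a, b, 24);
}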

Bootstrap/regtest passed on ppc64le targeting power7/power8/power9, ok for 
trunk?

Thanks!
Aaron

2018-10-27  Aaron Sawdey  

* config/rs6000/rs6000.md (bswapdi2): Force address into register
if not in one already.
(bswapdi2_load): Change predicate to indexed_or_indirect_operand.
(bswapdi2_store): Ditto.

Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265393)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2512,9 +2512,27 @@
   if (TARGET_POWERPC64 && TARGET_LDBRX)
 {
   if (MEM_P (src))
-   emit_insn (gen_bswapdi2_load (dest, src));
+{
+  rtx addr = XEXP (src, 0);
+  if (!legitimate_indirect_address_p (addr, reload_completed)
+  && !legitimate_indexed_address_p (addr, reload_completed))
+{
+  addr = force_reg (Pmode, addr);
+  src = replace_equiv_address_nv (src, addr);
+}
+ emit_insn (gen_bswapdi2_load (dest, src));
+}
   else if (MEM_P (dest))
-   emit_insn (gen_bswapdi2_store (dest, src));
+{
+  rtx addr = XEXP (dest, 0);
+  if (!legitimate_indirect_address_p (addr, reload_completed)
+  && !legitimate_indexed_address_p (addr, reload_completed))
+{
+  addr = force_reg (Pmode, addr);
+  dest = replace_equiv_address_nv (dest, addr);
+}
+ emit_insn (gen_bswapdi2_store (dest, src));
+}
   else if (TARGET_P9_VECTOR)
emit_insn (gen_bswapdi2_xxbrd (dest, src));
   else
@@ -2535,13 +2553,13 @@
 ;; Power7/cell has ldbrx/stdbrx, so use it directly
 (define_insn "bswapdi2_load"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-   (bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+   (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "Z")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "ldbrx %0,%y1"
   [(set_attr "type" "load")])

 (define_insn "bswapdi2_store"
-  [(set (match_operand:DI 0 "memory_operand" "=Z")
+  [(set (match_operand:DI 0 "indexed_or_indirect_operand" "=Z")
(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "stdbrx %1,%y0"




-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] improve gpr inline expansion of str[n]cmp

2018-10-25 Thread Aaron Sawdey
This patch changes the sequence that gcc generates for inline expansion of
strcmp/strncmp using scalar (gpr) instructions. The new sequence is one
instruction shorter and uses cmpb/cmpb/orc./bne, which I have also been
told valgrind should be able to understand, as the defined/undefined info
can be propagated and should show that the branch is not based on any
undefined data past the end of the string.

Performance is mostly a wash. The new sequence is faster if there is a
difference in the strings; the old sequence is slightly faster for short
strings that do not differ. The new sequence is faster for long strings
that do not differ, but that isn't important because if vsx is enabled,
the gpr sequence is only used for 15 bytes or less.
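
For reference, a rough C model of what one 8-byte step of the new sequence
computes (a sketch for exposition, assuming 64-bit unsigned long; cmpb
yields 0xff in each byte position where its operands match, and orc is
or-with-complement):

static unsigned long
cmpb_model (unsigned long a, unsigned long b)
{
  unsigned long r = 0;
  for (int i = 0; i < 64; i += 8)
    if (((a >> i) & 0xff) == ((b >> i) & 0xff))
      r |= 0xffUL << i;
  return r;
}

/* One step: s1 and s2 hold 8 bytes of each string.  zero_mask marks
   terminator bytes in s1, eq_mask marks equal bytes; the orc. result
   zero_mask | ~eq_mask is nonzero exactly when a difference or the end
   of the string has been found, which is when the bne is taken.  */
static int
step_found_diff_or_end (unsigned long s1, unsigned long s2)
{
  unsigned long zero_mask = cmpb_model (s1, 0);
  unsigned long eq_mask = cmpb_model (s1, s2);
  return (zero_mask | ~eq_mask) != 0;
}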

Bootstrap/regtest passes on ppc64le (power8, power9), ppc64 (power8)
and ppc32 (power8). Ok for trunk?

Thanks,
Aaron


2018-10-25  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Change to
a shorter sequence with fewer branches.
(emit_final_str_compare_gpr): Ditto.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 265393)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -259,7 +259,7 @@
   gcc_assert (mode == E_QImode);
   emit_move_insn (reg, mem);
   break;
-
+
 default:
   gcc_unreachable ();
   break;
@@ -726,7 +726,7 @@
 {
   if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
/* Do not expect length longer than word_mode.  */
-   return false;
+   return false;
   else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE 
(word_mode))
{
  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
@@ -770,7 +770,7 @@
   rtx j;

   /* Example of generated code for 35 bytes aligned 1 byte.
-
+
 mtctr 8
 li 6,0
 li 5,8
@@ -798,7 +798,7 @@
 popcntd 9,9
 subfe 10,10,10
 or 9,9,10
-
+
  Compiled with -fno-reorder-blocks for clarity.  */

   /* Structure of what we're going to do:
@@ -1041,7 +1041,7 @@
   if (!bytes_is_const)
{
  /* If we're dealing with runtime length, we have to check if
-it's zero after the loop. When length is known at compile
+it's zero after the loop.  When length is known at compile
 time the no-remainder condition is dealt with above.  By
 doing this after cleanup_label, we also deal with the
 case where length is 0 at the start and we bypass the
@@ -1411,7 +1411,7 @@
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
   /* P7/P8 code uses cond for subfc. but P9 uses
- it for cmpld which needs CCUNSmode. */
+ it for cmpld which needs CCUNSmode.  */
   rtx cond;
   if (TARGET_P9_MISC)
 cond = gen_reg_rtx (CCUNSmode);
@@ -1655,7 +1655,7 @@
emit_label (convert_label);

   /* We need to produce DI result from sub, then convert to target SI
-while maintaining <0 / ==0 / >0 properties. This sequence works:
+while maintaining <0 / ==0 / >0 properties.  This sequence works:
 subfc L,A,B
 subfe H,H,H
 popcntd L,L
@@ -1740,7 +1740,7 @@
to strcmp/strncmp if we have equality at the end of the inline comparison.
P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
to clean up and generate the final comparison result.
-   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
set the final result.  */
 static void
 expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
@@ -1763,12 +1763,9 @@
   while (bytes_to_compare > 0)
 {
   /* GPR compare sequence:
- check each 8B with: ld/ld cmpd bne
-If equal, use rldicr/cmpb to check for zero byte.
+ check each 8B with: ld/ld/cmpb/cmpb/orc./bne
+
  cleanup code at end:
- cmpb  get byte that differs
- cmpb  look for zero byte
- orc   combine
  cntlzd  get bit of first zero/diff byte
  subfic  convert for rldcl use
  rldcl rldcl   extract diff/zero byte
@@ -1776,7 +1773,7 @@

  The last compare can branch around the cleanup code if the
  result is zero because the strings are exactly equal.  */
-
+
   unsigned int align = compute_current_alignment (base_align, offset);
   load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
   load_mode_size = GET_MODE_SIZE (load_mode);
@@ -1801,34 +1798,49 @@
   rid of the extra bytes.  */
cmp_bytes = bytes_to_compare;

-  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
+  rtx offset_reg

Re: [PATCH][rs6000][PR target/87474] fix strncmp expansion with -mno-power8-vector

2018-10-02 Thread Aaron Sawdey



On 10/2/18 3:38 AM, Segher Boessenkool wrote:
> On Mon, Oct 01, 2018 at 11:09:44PM -0500, Aaron Sawdey wrote:
>> PR/87474 happens because I didn't check that both vector and VSX instructions
>> were enabled, so insns that are disabled get generated with 
>> -mno-power8-vector.
> 
>>  PR target/87474
>>  * config/rs6000/rs6000-string.c (expand_strn_compare): Check that both
>>  vector and VSX are enabled.
> 
> You mean "P8 vector" or similar, I think?

True, it should say TARGET_P[89]_VECTOR.

> 
> 
>> --- gcc/config/rs6000/rs6000-string.c(revision 264760)
>> +++ gcc/config/rs6000/rs6000-string.c(working copy)
>> @@ -2205,6 +2205,7 @@
>>  }
>>else
>>  {
>> +  /* Implies TARGET_P8_VECTOR here. */
> 
> That isn't true as far as I see.

We can only enter emit_final_str_compare_vec() if TARGET_P8_VECTOR is set.
That's the additional check this patch adds. So in this function you can
have both P8 and P9 vector, or just P8 vector.
rs6000_option_override_internal() enforces that P8 vector must be set if
P9 vector is set. So in the else here we know that only P8 vector is set.
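
So the shape relied on is (sketch, with hypothetical helper names):

if (TARGET_P9_VECTOR)    /* implies TARGET_P8_VECTOR */
  emit_p9_cleanup ();    /* vcmpnezb-based path; hypothetical helper */
else                     /* only P8 vector available */
  emit_p8_cleanup ();    /* vgbbd-based path; hypothetical helper */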

> 
> 
> Okay for trunk with improved changelog and that stray line removed.
> Thanks!
> 
> 
> Segher
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000][PR target/87474] fix strncmp expansion with -mno-power8-vector

2018-10-01 Thread Aaron Sawdey
PR/87474 happens because I didn't check that both vector and VSX instructions
were enabled, so insns that are disabled get generated with -mno-power8-vector.

Regstrap passes on ppc64le, ok for trunk?

Thanks!
  Aaron



2018-10-01  Aaron Sawdey  

PR target/87474
* config/rs6000/rs6000-string.c (expand_strn_compare): Check that both
vector and VSX are enabled.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 264760)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2205,6 +2205,7 @@
 }
   else
 {
+  /* Implies TARGET_P8_VECTOR here. */
   rtx diffix = gen_reg_rtx (DImode);
   rtx result_gbbd = gen_reg_rtx (V16QImode);
   /* Since each byte of the input is either 00 or FF, the bytes in
@@ -2313,9 +2314,12 @@
   /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at
  least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
  at least POWER8.  That way we can rely on overlapping compares to
- do the final comparison of less than 16 bytes.  Also I do not want
- to deal with making this work for 32 bits.  */
-  int use_vec = (bytes >= 16 && !TARGET_32BIT && TARGET_EFFICIENT_UNALIGNED_VSX);
+ do the final comparison of less than 16 bytes.  Also I do not
+ want to deal with making this work for 32 bits.  In addition, we
+ have to make sure that we have at least P8_VECTOR (we don't allow
+ P9_VECTOR without P8_VECTOR).  */
+  int use_vec = (bytes >= 16 && !TARGET_32BIT
+&& TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

   if (use_vec)
 required_align = 16;


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH, rs6000] inline expansion of str[n]cmp using vec/vsx instructions

2018-08-22 Thread Aaron Sawdey
This patch teaches rs6000 inline expansion of strcmp/strncmp how to
generate vector/vsx code for power8/power9 processors. Compares of 16
bytes and longer are generated using the vector code, which is
considerably faster than the gpr based code.
Bootstrap/regtest passes on ppc64 (power8) and ppc64le (power8 and
power9). Ok for trunk?

Thanks!
Aaron


2018-08-22  Aaron Sawdey  

* config/rs6000/altivec.md (altivec_eq<mode>): Remove star.
* config/rs6000/rs6000-string.c (do_load_for_compare): Support
vector load modes.
(expand_strncmp_vec_sequence): New function.
(emit_final_str_compare_vec): New function.
(expand_strn_compare): Support for vector strncmp.
* config/rs6000/rs6000.opt (-mstring-compare-inline-limit): Change
length specification to bytes.
* config/rs6000/vsx.md (vsx_ld_elemrev_v16qi_internal): Remove star.
(vcmpnezb_p): New pattern.
* doc/invoke.texi (RS/6000 and PowerPC Options): Update documentation
for option -mstring-compare-inline-limit.

Index: gcc/config/rs6000/altivec.md
===
--- gcc/config/rs6000/altivec.md(revision 263753)
+++ gcc/config/rs6000/altivec.md(working copy)
@@ -603,7 +603,7 @@
   "vcmpbfp %0,%1,%2"
   [(set_attr "type" "veccmp")])
 
-(define_insn "*altivec_eq"
+(define_insn "altivec_eq"
   [(set (match_operand:VI2 0 "altivec_register_operand" "=v")
(eq:VI2 (match_operand:VI2 1 "altivec_register_operand" "v")
(match_operand:VI2 2 "altivec_register_operand" "v")))]
@@ -2304,7 +2304,7 @@
 
 ;; Compare vectors producing a vector result and a predicate, setting CR6 to
 ;; indicate a combined status
-(define_insn "*altivec_vcmpequ_p"
+(define_insn "altivec_vcmpequ_p"
   [(set (reg:CC CR6_REGNO)
(unspec:CC [(eq:CC (match_operand:VI2 1 "register_operand" "v")
   (match_operand:VI2 2 "register_operand" "v"))]
Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 263753)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -157,6 +157,29 @@
 {
   switch (GET_MODE (reg))
 {
+case E_V16QImode:
+  switch (mode) {
+  case E_V16QImode:
+   if (!BYTES_BIG_ENDIAN) 
+ if (TARGET_P9_VECTOR)
+   emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
+ else
+   {
+ rtx reg_v2di = simplify_gen_subreg (V2DImode, reg, V16QImode, 0);
+ gcc_assert (MEM_P (mem));
+ rtx addr = XEXP (mem, 0);
+ rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
+ MEM_COPY_ATTRIBUTES (mem_v2di, mem);
+ set_mem_size (mem, GET_MODE_SIZE (V2DImode));
+ emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
+   }
+   else
+ emit_insn (gen_vsx_movv2di_64bit (reg, mem));
+   break;
+  default:
+   gcc_unreachable ();
+  }
+  break;
 case E_DImode:
   switch (mode)
{
@@ -227,7 +250,22 @@
  gcc_unreachable ();
}
   break;
+
+case E_QImode:
+  switch (mode)
+   {
+   case E_QImode:
+ emit_move_insn (reg, mem);
+ break;
+   default:
+ debug_rtx(reg);
+ gcc_unreachable ();
+ break;
+   }
+  break;
+  
 default:
+  debug_rtx(reg);
   gcc_unreachable ();
   break;
 }
@@ -1878,6 +1916,175 @@
 
 }
 
+/* Generate the sequence of compares for strcmp/strncmp using vec/vsx 
+   instructions.
+
+   BYTES_TO_COMPARE is the number of bytes to be compared.
+   ORIG_SRC1 is the unmodified rtx for the first string.
+   ORIG_SRC2 is the unmodified rtx for the second string.
+   S1ADDR is the register to use for the base address of the first string.
+   S2ADDR is the register to use for the base address of the second string.
+   OFF_REG is the register to use for the string offset for loads.
+   S1DATA is the register for loading the first string.
+   S2DATA is the register for loading the second string.
+   VEC_RESULT is the rtx for the vector result indicating the byte difference.
+   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
+   to strcmp/strncmp if we have equality at the end of the inline comparison.
+   CLEANUP_LABEL is rtx for a label we generate if we need code to clean up
+   and generate the final comparison result.
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just 
+   set the final result.  */
+static void
+expand_strncmp_vec_sequence(unsigned HOST_WIDE_INT bytes_to_compare,
+   rtx orig_src1, rtx orig_src2,
+

[PATCH, rs6000] refactor/cleanup in rs6000-string.c

2018-07-31 Thread Aaron Sawdey
Just teasing things apart a bit more in this function so I can add
vec/vsx code generation without making it enormous and
incomprehensible.

Bootstrap/regtest passes on powerpc64le, ok for trunk?

Thanks,
Aaron


2018-07-31  Aaron Sawdey  

* config/rs6000/rs6000-string.c (select_block_compare_mode): Move test
for word_mode_ok here instead of passing as argument.
(expand_block_compare): Change select_block_compare_mode() call.
(expand_strncmp_gpr_sequence): New function.
(expand_strn_compare): Make use of expand_strncmp_gpr_sequence.

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 263039)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -238,13 +238,11 @@
 
OFFSET is the current read offset from the beginning of the block.
BYTES is the number of bytes remaining to be read.
-   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
-   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
-   the largest allowable mode.  */
+   ALIGN is the minimum alignment of the memory blocks being compared in bytes.  */
 static machine_mode
 select_block_compare_mode (unsigned HOST_WIDE_INT offset,
   unsigned HOST_WIDE_INT bytes,
-  unsigned HOST_WIDE_INT align, bool word_mode_ok)
+  unsigned HOST_WIDE_INT align)
 {
   /* First see if we can do a whole load unit
  as that will be more efficient than a larger load + shift.  */
@@ -257,6 +255,11 @@
   /* The most we can read without potential page crossing.  */
   unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
 
+  /* If we have an LE target without ldbrx and word_mode is DImode,
+ then we must avoid using word_mode.  */
+  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
+  && word_mode == DImode);
+
   if (word_mode_ok && bytes >= UNITS_PER_WORD)
 return word_mode;
   else if (bytes == GET_MODE_SIZE (SImode))
@@ -1382,16 +1385,11 @@
   else
 cond = gen_reg_rtx (CCmode);
 
-  /* If we have an LE target without ldbrx and word_mode is DImode,
- then we must avoid using word_mode.  */
-  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
-  && word_mode == DImode);
-
   /* Strategy phase.  How many ops will this take and should we expand it?  */
 
   unsigned HOST_WIDE_INT offset = 0;
   machine_mode load_mode =
-select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
+select_block_compare_mode (offset, bytes, base_align);
   unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
 
   /* We don't want to generate too much code.  The loop code can take
@@ -1445,8 +1443,7 @@
   while (bytes > 0)
 {
   unsigned int align = compute_current_alignment (base_align, offset);
-  load_mode = select_block_compare_mode (offset, bytes,
-align, word_mode_ok);
+  load_mode = select_block_compare_mode (offset, bytes, align);
   load_mode_size = GET_MODE_SIZE (load_mode);
   if (bytes >= load_mode_size)
cmp_bytes = load_mode_size;
@@ -1698,6 +1695,189 @@
   LABEL_NUSES (strncmp_label) += 1;
 }
 
+/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
+   BYTES_TO_COMPARE is the number of bytes to be compared.
+   BASE_ALIGN is the smaller of the alignment of the two strings.
+   ORIG_SRC1 is the unmodified rtx for the first string.
+   ORIG_SRC2 is the unmodified rtx for the second string.
+   TMP_REG_SRC1 is the register for loading the first string.
+   TMP_REG_SRC2 is the register for loading the second string.
+   RESULT_REG is the rtx for the result register.
+   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
+   to strcmp/strncmp if we have equality at the end of the inline comparison.
+   CLEANUP_LABEL is rtx for a label we generate if we need code to clean up
+   and generate the final comparison result.
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just 
+   set the final result.  */
+static void
+expand_strncmp_gpr_sequence(unsigned HOST_WIDE_INT bytes_to_compare,
+   unsigned int base_align,
+   rtx orig_src1, rtx orig_src2,
+   rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
+   bool equality_compare_rest, rtx &cleanup_label,
+   rtx final_move_label)
+{
+  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
+  machine_mode load_mode;
+  unsigned int load_mode_size;
+  unsigned HOST_WIDE_INT cmp_bytes = 0;
+  unsigned HOST_WIDE_INT offset = 0;
+  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
+  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
+
+  while (bytes_to_c

[PATCH, rs6000] don't use unaligned vsx for memset of less than 32 bytes

2018-06-25 Thread Aaron Sawdey
In gcc 8 I added support for unaligned vsx in the builtin expansion of
memset(x,0,y). Turns out that for memset of less than 32 bytes, this
doesn't really help much, and it also runs into an egregious
load-hit-store case in CPU2006 components gcc and hmmer.

This patch reverts to the previous (gcc 7) behavior for memset of 16-31
bytes, which is to use vsx stores only if the target is 16-byte
aligned. For 32 bytes or more, unaligned vsx stores will still be used.
Performance testing of the memset expansion shows that not much is
given up by using scalar stores for 16-31 bytes, and CPU2006 runs show
the performance regression is fixed.
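
To illustrate the new cutover (hypothetical example):

/* Under this patch: 16-31 byte memsets use scalar stores unless the
   destination is known to be 16-byte aligned; 32 bytes and up still
   get unaligned vsx stores.  */
void f (char *p) { __builtin_memset (p, 0, 24); }  /* scalar stores */
void g (char *p) { __builtin_memset (p, 0, 48); }  /* vsx stores */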

Regstrap passes on powerpc64le, ok for trunk and backport to 8?

Thanks,
   Aaron

2018-06-25  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_clear): Don't use
unaligned vsx for 16B memset.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC ToolchainIndex: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 261808)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -90,7 +90,9 @@
   machine_mode mode = BLKmode;
   rtx dest;
 
-  if (bytes >= 16 && TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
+  if (TARGET_ALTIVEC
+	  && ((bytes >= 16 && align >= 128)
+	  || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
 	{
 	  clear_bytes = 16;
 	  mode = V4SImode;


[PATCH, rs6000] PR target/86222 fix truncation issue with constants when compiling -m32

2018-06-21 Thread Aaron Sawdey
expand_strn_compare was not using gen_int_mode or trunc_int_for_mode to
properly truncate to Pmode when creating constants in the generated rtx.
This led to an improper constant and the ICE in PR/86222.
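
The distinction in brief (sketch): GEN_INT wraps the raw HOST_WIDE_INT,
while gen_int_mode (c, mode) is GEN_INT (trunc_int_for_mode (c, mode)),
so the constant is canonical for the mode:

rtx bad  = GEN_INT (bytes);             /* mode-unaware; can be an
                                           invalid const_int when
                                           Pmode is 32-bit */
rtx good = gen_int_mode (bytes, Pmode); /* truncated/sign-extended
                                           for Pmode */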

Testing on ppc64 with -m32, -m32 -mpowerpc64 and -m64. If regstrap
passes, ok for trunk and backport to 8?

Thanks, 
   Aaron


2018-06-19  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strn_compare): Handle -m32
correctly.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC ToolchainIndex: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 261850)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -1925,20 +1925,15 @@
 	  /* -m32 -mpowerpc64 results in word_mode being DImode even
 	 though otherwise it is 32-bit. The length arg to strncmp
 	 is a size_t which will be the same size as pointers.  */
-	  rtx len_rtx;
-	  if (TARGET_64BIT)
-	len_rtx = gen_reg_rtx (DImode);
-	  else
-	len_rtx = gen_reg_rtx (SImode);
+	  rtx len_rtx = gen_reg_rtx (Pmode);
+	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
 
-	  emit_move_insn (len_rtx, bytes_rtx);
-
 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
    target, LCT_NORMAL, GET_MODE (target),
    force_reg (Pmode, src1_addr), Pmode,
    force_reg (Pmode, src2_addr), Pmode,
-   len_rtx, GET_MODE (len_rtx));
+   len_rtx, Pmode);
 	}
 
   rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
@@ -2126,18 +2121,12 @@
 	}
   else
 	{
-	  rtx len_rtx;
-	  if (TARGET_64BIT)
-	len_rtx = gen_reg_rtx (DImode);
-	  else
-	len_rtx = gen_reg_rtx (SImode);
-
-	  emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
+	  rtx len_rtx = gen_reg_rtx (Pmode);
+	  emit_move_insn (len_rtx, gen_int_mode (bytes-compare_length, Pmode));
 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
    target, LCT_NORMAL, GET_MODE (target),
-   src1, Pmode, src2, Pmode,
-   len_rtx, GET_MODE (len_rtx));
+   src1, Pmode, src2, Pmode, len_rtx, Pmode);
 	}
 
   rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);


[PATCH, rs6000] cleanup/refactor in rs6000-string.c

2018-06-14 Thread Aaron Sawdey
This patch cleans up and refactors some stuff in rs6000-string.c
before I start working on adding vec/vsx support to str[n]cmp inline
expansion. Also removes the * from vsx_mov<mode>_64bit in vsx.md
because I'll be using that pattern to generate lxvd2x.
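
Removing the leading '*' matters because genemit only creates a callable
generator function for named patterns; the intended use is along these
lines (sketch):

/* With the star removed the pattern can be emitted directly, e.g. to
   get lxvd2x for an unaligned 16-byte load in the compare sequence.  */
emit_insn (gen_vsx_movv2di_64bit (reg, mem));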

Bootstrap/regtest passes on ppc64le power8 -- ok for trunk?

Thanks!
   Aaron

2018-06-14  Aaron Sawdey  

* config/rs6000/rs6000-string.c (select_block_compare_mode): Check
TARGET_EFFICIENT_OVERLAPPING_UNALIGNED here instead of in caller.
(do_and3, do_and3_mask, do_cmpb3, do_rotl3): New functions.
(expand_block_compare): Change select_block_compare_mode call.
(expand_strncmp_align_check): Use new functions, fix comment.
(emit_final_str_compare_gpr): New function.
(expand_strn_compare): Refactor and clean up code.
* config/rs6000/vsx.md (vsx_mov<mode>_64bit): Remove *.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC ToolchainIndex: rs6000-string.c
===
--- rs6000-string.c	(revision 261573)
+++ rs6000-string.c	(working copy)
@@ -264,6 +264,7 @@
   else if (bytes == GET_MODE_SIZE (QImode))
 return QImode;
   else if (bytes < GET_MODE_SIZE (SImode)
+	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
 	   && offset >= GET_MODE_SIZE (SImode) - bytes)
 /* This matches the case were we have SImode and 3 bytes
and offset >= 1 and permits us to move back one and overlap
@@ -271,6 +272,7 @@
unwanted bytes off of the input.  */
 return SImode;
   else if (word_mode_ok && bytes < UNITS_PER_WORD
+	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
 	   && offset >= UNITS_PER_WORD-bytes)
 /* Similarly, if we can use DImode it will get matched here and
can do an overlapping read that ends at the end of the block.  */
@@ -406,6 +408,70 @@
 emit_insn (gen_addsi3 (dest, src1, src2));
 }
 
+/* Emit an and of the proper mode for DEST.
+
+   DEST is the destination register for the and.
+   SRC1 is the first and input.
+   SRC2 is the second and input.
+
+   Computes DEST = SRC1 & SRC2.  */
+static void
+do_and3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_anddi3 (dest, src1, src2));
+  else
+emit_insn (gen_andsi3 (dest, src1, src2));
+}
+
+/* Emit an and-mask of the proper mode for DEST.
+
+   DEST is the destination register for the and.
+   SRC1 is the first and input.
+   SRC2 is the mask input.
+
+   Computes DEST = SRC1 & SRC2.  */
+static void
+do_and3_mask (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_anddi3_mask (dest, src1, src2));
+  else
+emit_insn (gen_andsi3_mask (dest, src1, src2));
+}
+
+/* Emit a cmpb of the proper mode for DEST.
+
+   DEST is the destination register for the cmpb.
+   SRC1 is the first input.
+   SRC2 is the second input.
+
+   Computes cmpb of SRC1, SRC2.  */
+static void
+do_cmpb3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_cmpbdi3 (dest, src1, src2));
+  else
+emit_insn (gen_cmpbsi3 (dest, src1, src2));
+}
+
+/* Emit a rotl of the proper mode for DEST.
+
+   DEST is the destination register for the rotate.
+   SRC1 is the input to be rotated.
+   SRC2 is the rotation amount.
+
+   Computes DEST = SRC1 rotated left by SRC2.  */
+static void
+do_rotl3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_rotldi3 (dest, src1, src2));
+  else
+emit_insn (gen_rotlsi3 (dest, src1, src2));
+}
+
 /* Generate rtl for a load, shift, and compare of less than a full word.
 
LOAD_MODE is the machine mode for the loads.
@@ -1393,11 +1459,8 @@
   while (bytes > 0)
 {
   unsigned int align = compute_current_alignment (base_align, offset);
-  if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
-	load_mode = select_block_compare_mode (offset, bytes, align,
-	   word_mode_ok);
-  else
-	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
+  load_mode = select_block_compare_mode (offset, bytes,
+	 align, word_mode_ok);
   load_mode_size = GET_MODE_SIZE (load_mode);
   if (bytes >= load_mode_size)
 	cmp_bytes = load_mode_size;
@@ -1625,22 +1688,19 @@
   return true;
 }
 
-/* Generate alignment check and branch code to set up for
+/* Generate page crossing check and branch code to set up for
strncmp when we don't have DI alignment.
STRNCMP_LABEL is the label to branch if there is a page crossing.
-   SRC is the string pointer to be examined.
+   SRC_ADDR is the string address to be examined.
BYTES is the max number of bytes to compare.  */
 static void
-expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
+expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
 {
   rtx lab_ref = 

Re: [PATCH] rs6000 PR83660 fix ICE with vec_extract

2018-04-23 Thread Aaron Sawdey
This also affects gcc 7 and is fixed by the same patch. I've tested the
backport to 7 on ppc64le and it causes no new fails. OK for backport to
7 (and 6 if it's also needed there)?

Thanks,
   Aaron


On Fri, 2018-04-13 at 15:37 -0500, Aaron Sawdey wrote:
> Per the discussion on the 83660, I've come to a minimal patch to
> prevent this. Basically marking the vec_extract tree as having side
> effects later makes sure that it gets all the cleanup points it needs
> so that gimplify_cleanup_point_expr () is happy.  Also because
> vec_insert puts a MODIFY_EXPR in there, it has side effects and this
> problem will not occur.
> 
> Doing bootstrap/regtest on ppc64le with -mcpu=power7 since that is
> where this issue arises. OK for trunk if everything passes?
> 
> Thanks,
>Aaron
> 
> 
> 2018-04-13  Aaron Sawdey  <acsaw...@linux.ibm.com>
> 
>   PR target/83660
>   * config/rs6000/rs6000-c.c
> (altivec_resolve_overloaded_builtin): Mark
>   vec_extract expression as having side effects to make sure it
> gets
>   a cleanup point.
> 
> 2018-04-13  Aaron Sawdey  <acsaw...@linux.ibm.com>
> 
>   PR target/83660
>   * gcc.target/powerpc/pr83660.C: New test.
> 
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH] rs6000 PR83660 fix ICE with vec_extract

2018-04-13 Thread Aaron Sawdey
Per the discussion on the 83660, I've come to a minimal patch to
prevent this. Basically marking the vec_extract tree as having side
effects later makes sure that it gets all the cleanup points it needs
so that gimplify_cleanup_point_expr () is happy.  Also because
vec_insert puts a MODIFY_EXPR in there, it has side effects and this
problem will not occur.

Doing bootstrap/regtest on ppc64le with -mcpu=power7 since that is
where this issue arises. OK for trunk if everything passes?

Thanks,
   Aaron


2018-04-13  Aaron Sawdey  <acsaw...@linux.ibm.com>

PR target/83660
* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Mark
vec_extract expression as having side effects to make sure it gets
a cleanup point.

2018-04-13  Aaron Sawdey  <acsaw...@linux.ibm.com>

PR target/83660
* gcc.target/powerpc/pr83660.C: New test.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC ToolchainIndex: config/rs6000/rs6000-c.c
===
--- config/rs6000/rs6000-c.c	(revision 259353)
+++ config/rs6000/rs6000-c.c	(working copy)
@@ -6705,6 +6705,15 @@
   stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
   stmt = build_indirect_ref (loc, stmt, RO_NULL);
 
+  /* PR83660: We mark this as having side effects so that
+	 downstream in fold_build_cleanup_point_expr () it will get a
+	 CLEANUP_POINT_EXPR.  If it does not we can run into an ICE
+	 later in gimplify_cleanup_point_expr ().  Potentially this
+	 causes missed optimization because there actually is no side
+	 effect.  */
+  if (c_dialect_cxx ())
+	TREE_SIDE_EFFECTS (stmt) = 1;
+
   return stmt;
 }
 
Index: testsuite/gcc.target/powerpc/pr83660.C
===
--- testsuite/gcc.target/powerpc/pr83660.C	(nonexistent)
+++ testsuite/gcc.target/powerpc/pr83660.C	(working copy)
@@ -0,0 +1,14 @@
+/* PR target/83660 */
+/* { dg-do compile } */
+/* { dg-options "-mcpu=power7" } */
+
+#include <altivec.h>
+
+typedef __vector unsigned int  uvec32_t  __attribute__((__aligned__(16)));
+
+unsigned get_word(uvec32_t v)
+{
+return ({const unsigned _B1 = 32;
+vec_extract((uvec32_t)v, 2);});
+}
+


[PATCH, rs6000] PR85321 improve documentation of -mcall and -mtraceback=

2018-04-10 Thread Aaron Sawdey
Another update to document -mcall- and -mtraceback= options. Cleanup to
remove -mabi={no-,}spe from the RS/6000 and PowerPC section. And a trim
to the help text for -mblock-compare-* and -mstring-compare-inline-limit
so they are not excessively long. The complete description for
those is now in invoke.texi. This is the last piece for 85321.

Testing in progress on linux-ppc64le, ok for trunk if tests are ok?

Thanks,
   Aaron

2018-04-10  Aaron Sawdey  <acsaw...@linux.ibm.com>

PR target/85321
* doc/invoke.texi (RS/6000 and PowerPC Options): Document options
-mcall= and -mtraceback. Remove options -mabi=spe and -mabi=no-spe
from PowerPC section.
* config/rs6000/sysv4.opt (mcall): Improve help text.
* config/rs6000/rs6000.opt (mblock-compare-inline-limit=): Trim
help text that is too long.
* config/rs6000/rs6000.opt (mblock-compare-inline-loop-limit=): Trim
help text that is too long.
* config/rs6000/rs6000.opt (mstring-compare-inline-limit=): Trim
help text that is too long.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC ToolchainIndex: doc/invoke.texi
===
--- doc/invoke.texi	(revision 259302)
+++ doc/invoke.texi	(working copy)
@@ -1076,7 +1076,10 @@
 -mprioritize-restricted-insns=@var{priority} @gol
 -msched-costly-dep=@var{dependence_type} @gol
 -minsert-sched-nops=@var{scheme} @gol
--mcall-sysv  -mcall-netbsd @gol
+-mcall-aixdesc  -mcall-eabi  -mcall-freebsd  @gol
+-mcall-linux  -mcall-netbsd  -mcall-openbsd  @gol
+-mcall-sysv  -mcall-sysv-eabi  -mcall-sysv-noeabi @gol
+-mtraceback=@var{traceback_type} @gol
 -maix-struct-return  -msvr4-struct-return @gol
 -mabi=@var{abi-type}  -msecure-plt  -mbss-plt @gol
 -mblock-move-inline-limit=@var{num} @gol
@@ -23957,6 +23960,12 @@
 On System V.4 and embedded PowerPC systems compile code for the
 OpenBSD operating system.
 
+@item -mtraceback=@var{traceback_type}
+@opindex mtraceback
+Select the type of traceback table.  Valid values for @var{traceback_type}
+are @samp{full}, @samp{part}, and @samp{no}.
+
 @item -maix-struct-return
 @opindex maix-struct-return
 Return all structures in memory (as specified by the AIX ABI)@.
@@ -23973,16 +23982,6 @@
 @samp{no-spe}, @samp{ibmlongdouble}, @samp{ieeelongdouble},
 @samp{elfv1}, @samp{elfv2}@.
 
-@item -mabi=spe
-@opindex mabi=spe
-Extend the current ABI with SPE ABI extensions.  This does not change
-the default ABI, instead it adds the SPE ABI extensions to the current
-ABI@.
-
-@item -mabi=no-spe
-@opindex mabi=no-spe
-Disable Book-E SPE ABI extensions for the current ABI@.
-
 @item -mabi=ibmlongdouble
 @opindex mabi=ibmlongdouble
 Change the current ABI to use IBM extended-precision long double.
Index: config/rs6000/sysv4.opt
===
--- config/rs6000/sysv4.opt	(revision 259301)
+++ config/rs6000/sysv4.opt	(working copy)
@@ -21,7 +21,7 @@
 
 mcall-
 Target RejectNegative Joined Var(rs6000_abi_name)
-Select ABI calling convention.
+-mcall=ABI	Select ABI calling convention.
 
 msdata=
 Target RejectNegative Joined Var(rs6000_sdata_name)
Index: config/rs6000/rs6000.opt
===
--- config/rs6000/rs6000.opt	(revision 259301)
+++ config/rs6000/rs6000.opt	(working copy)
@@ -335,15 +335,15 @@
 
 mblock-compare-inline-limit=
 Target Report Var(rs6000_block_compare_inline_limit) Init(31) RejectNegative Joined UInteger Save
-Specify the maximum number of bytes to compare inline with non-looping code. If this is set to 0, all inline expansion (non-loop and loop) of memcmp is disabled.
+Specify the maximum number of bytes to compare inline with non-looping code.
 
 mblock-compare-inline-loop-limit=
 Target Report Var(rs6000_block_compare_inline_loop_limit) Init(-1) RejectNegative Joined UInteger Save
-Specify the maximum number of bytes to compare inline with loop code generation.  If the length is not known at compile time, memcmp will be called after this many bytes are compared. By default, a length will be picked depending on the tuning target.
+Specify the maximum number of bytes to compare inline with loop code generation.
 
 mstring-compare-inline-limit=
 Target Report Var(rs6000_string_compare_inline_limit) Init(8) RejectNegative Joined UInteger Save
-Specify the maximum number pairs of load instructions that should be generated inline for the compare.  If the number needed exceeds the limit, a call to strncmp will be generated instead.
+Specify the maximum number of pairs of load instructions that should be generated for inline compares.
 
 misel
 Target Report Mask(ISEL) Var(rs6000_isa_flags)

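For anyone trying out the documented options, here is a minimal sketch
(my own example, not part of the patch; the file and function names are
made up) of how the newly documented flags are used:

/* traceback-demo.c -- a made-up example, not from the patch.
   On a powerpc*-linux toolchain one might compile it as

     gcc -O2 -S -mtraceback=part -mcall-linux traceback-demo.c

   to request a partial traceback table and the Linux ABI calling
   convention; -mtraceback=no would omit the table entirely.  */
int
add_one (int x)
{
  return x + 1;
}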

[PATCH, committed] Update my MAINTAINERS entry

2018-04-10 Thread Aaron Sawdey
Update to my new email address. Committed as 259301.

2018-04-10  Aaron Sawdey  <acsaw...@linux.ibm.com>

* MAINTAINERS: Update my email address.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: MAINTAINERS
===================================================================
--- MAINTAINERS	(revision 259295)
+++ MAINTAINERS	(working copy)
@@ -568,7 +568,7 @@
 Duncan Sands	<baldr...@gcc.gnu.org>
 Sujoy Saraswati	<sujoy.sarasw...@hpe.com>
 Trevor Saunders	<tbsaunde+...@tbsaunde.org>
-Aaron Sawdey	<acsaw...@linux.vnet.ibm.com>
+Aaron Sawdey	<acsaw...@linux.ibm.com>
 Roger Sayle	<ro...@eyesopen.com>
 Will Schmidt	<will_schm...@vnet.ibm.com>
 Tilo Schwarz	<t...@tilo-schwarz.de>


[PATCH, rs6000] Document options (PR85321)

2018-04-10 Thread Aaron Sawdey
This updates invoke.texi to document -mblock-compare-inline-limit,
-mblock-compare-inline-loop-limit, and -mstring-compare-inline-limit.

Tested with "make pdf", ok for trunk?

2018-04-10  Aaron Sawdey  <acsaw...@linux.ibm.com>

PR target/85321
* doc/invoke.texi (RS/6000 and PowerPC Options): Document options
-mblock-compare-inline-limit, -mblock-compare-inline-loop-limit,
and -mstring-compare-inline-limit.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 259295)
+++ gcc/doc/invoke.texi	(working copy)
@@ -1080,6 +1080,9 @@
 -maix-struct-return  -msvr4-struct-return @gol
 -mabi=@var{abi-type}  -msecure-plt  -mbss-plt @gol
 -mblock-move-inline-limit=@var{num} @gol
+-mblock-compare-inline-limit=@var{num} @gol
+-mblock-compare-inline-loop-limit=@var{num} @gol
+-mstring-compare-inline-limit=@var{num} @gol
 -misel  -mno-isel @gol
 -misel=yes  -misel=no @gol
 -mpaired @gol
@@ -24142,6 +24145,31 @@
 @var{num} is 32 bytes on 32-bit targets and 64 bytes on 64-bit
 targets.  The default value is target-specific.
 
+@item -mblock-compare-inline-limit=@var{num}
+@opindex mblock-compare-inline-limit
+Generate non-looping inline code for all block compares (such as calls
+to @code{memcmp} or structure compares) less than or equal to @var{num}
+bytes. If @var{num} is 0, all inline expansion (non-loop and loop) of
+block compare is disabled. The default value is target-specific.
+
+@item -mblock-compare-inline-loop-limit=@var{num}
+@opindex mblock-compare-inline-loop-limit
+Generate an inline expansion using loop code for all block compares that
+are less than or equal to @var{num} bytes, but greater than the limit
+for non-loop inline block compare expansion. If the block length is not
+constant, at most @var{num} bytes will be compared before @code{memcmp}
+is called to compare the remainder of the block. The default value is
+target-specific.
+
+@item -mstring-compare-inline-limit=@var{num}
+@opindex mstring-compare-inline-limit
+Generate at most @var{num} pairs of load instructions to compare the
+string inline. If the difference or end of string is not found by the
+end of the inline compare, a call to @code{strcmp} or @code{strncmp}
+takes care of the rest of the comparison. The default is 8 pairs of
+loads, which will compare 64 bytes on a 64-bit target and 32 bytes on a
+32-bit target.
+
 @item -G @var{num}
 @opindex G
 @cindex smaller data references (PowerPC)

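As a rough illustration of how these limits divide the work (a sketch
under the default limits described above on a 64-bit target, not text
from the patch):

#include <string.h>

struct key { char bytes[24]; };

/* sizeof (struct key) == 24 <= the default non-loop limit of 31, so
   this compare may expand to straight-line loads with no memcmp call.  */
int
same_key (const struct key *a, const struct key *b)
{
  return memcmp (a, b, sizeof (struct key)) == 0;
}

/* Here the length is unknown at compile time: with the loop limit in
   effect, up to that many bytes are compared inline and memcmp is
   called for whatever remains.  */
int
same_block (const char *a, const char *b, size_t n)
{
  return memcmp (a, b, n) == 0;
}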

[PATCH, rs6000] PR target/83822 fix redundant conditions

2018-03-29 Thread Aaron Sawdey
I've fixed the redundant conditions in the expressions pointed out by
PR 83822. Bootstrap/regtest passes on ppc64le; OK for trunk?

Aaron


2018-03-29  Aaron Sawdey  <acsaw...@linux.vnet.ibm.com>

PR target/83822
* config/rs6000/rs6000-string.c (expand_compare_loop): Fix redundant
condition.
* config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Fix redundant
condition.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-c.c
===================================================================
--- gcc/config/rs6000/rs6000-c.c	(revision 258900)
+++ gcc/config/rs6000/rs6000-c.c	(working copy)
@@ -642,8 +642,7 @@
 	  cpp_get_callbacks (pfile)->macro_to_expand = rs6000_macro_to_expand;
 	}
 }
-  if (!TARGET_HARD_FLOAT
-  || (TARGET_HARD_FLOAT && !TARGET_DOUBLE_FLOAT))
+  if (!TARGET_HARD_FLOAT || !TARGET_DOUBLE_FLOAT)
 builtin_define ("_SOFT_DOUBLE");
   /* Used by lwarx/stwcx. errata work-around.  */
   if (rs6000_cpu == PROCESSOR_PPC405)
Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 258900)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -966,8 +966,7 @@
   rtx final_cleanup = gen_label_rtx ();
   rtx cmp_rem_before = gen_reg_rtx (word_mode);
   /* Compare one more word_mode chunk if needed.  */
-  if (!bytes_is_const
-	  || (bytes_is_const && bytes_remaining >= load_mode_size))
+  if (!bytes_is_const || bytes_remaining >= load_mode_size)
 	{
 	  /* If remainder length < word length, branch to final
 	 cleanup compare.  */

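For the record, the simplification is a pure boolean identity:
!a || (a && !b) is equivalent to !a || !b. A quick exhaustive check (my
own sketch, not part of the patch):

#include <assert.h>

int
main (void)
{
  for (int a = 0; a <= 1; a++)
    for (int b = 0; b <= 1; b++)
      assert ((!a || (a && !b)) == (!a || !b));
  return 0;
}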

PR target/84743 adjust reassociation widths for power8/power9

2018-03-12 Thread Aaron Sawdey
Looking at CPU2017 results for different reassociation widths, things
have shifted since I last looked at this with CPU2006 in the early GCC 7
timeframe. The best approach seems to be to set the reassociation width
to 1 for all integer modes, which is what the attached patch does.

I also tried setting the width to 1 for float-mode PLUS_EXPR, as this
patch did for aarch64, but that does not seem to be helpful for power8:
https://gcc.gnu.org/ml/gcc-patches/2018-02/msg01271.html

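To make the knob concrete, here is the kind of integer sum the
reassociation pass can rewrite (an illustrative sketch of mine, not from
the patch). With a width of 1 it stays a serial chain of adds; a larger
width lets the pass form independent partial sums, at the cost of extra
registers:

/* width 1:  (((((((s0+s1)+s2)+s3)+s4)+s5)+s6)+s7)
   width 4:  ((s0+s1)+(s2+s3)) + ((s4+s5)+(s6+s7))  */
long
sum8 (const long *s)
{
  return s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7];
}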

The results below are the % performance improvement on power8, comparing
trunk with the attached patch against trunk with --param
tree-reassoc-width=1 to disable parallel reassociation for everything
(first column of results) and against unmodified trunk (second column of
results).

CPU2017 component      vs width=1   vs trunk
500.perlbench_r          -0.36%      -0.15%
502.gcc_r                 0.06%       0.04%
505.mcf_r                 0.32%       0.24%
520.omnetpp_r             0.57%      -0.95%
523.xalancbmk_r           1.45%       1.04%
525.x264_r               -0.05%       0.09%
531.deepsjeng_r           0.04%       0.09%
541.leela_r               0.10%       0.72%
548.exchange2_r           0.08%       0.73%
557.xz_r                  0.09%       2.12%
CPU2017 int geo mean      0.23%       0.40%
503.bwaves_r              0.00%       0.01%
507.cactuBSSN_r           0.05%      -0.02%
508.namd_r                0.00%       0.00%
510.parest_r             -0.01%       0.20%
511.povray_r              0.03%      -0.24%
519.lbm_r                -0.04%      -0.16%
521.wrf_r                -0.01%      -0.56%
526.blender_r            -0.82%      -0.47%
527.cam4_r               -0.18%       0.06%
538.imagick_r            -0.02%       0.01%
544.nab_r                 0.00%       0.23%
549.fotonik3d_r           0.24%       0.54%
554.roms_r               -0.05%       0.03%
CPU2017 fp geo mean      -0.06%      -0.03%

Bottom line: a net improvement for CPU2017 int compared with either
current trunk or with parallel reassociation disabled. For CPU2017 fp, a
very small overall degradation.

Currently doing regstrap on ppc64le, ok for trunk if results look good?

Thanks!
   Aaron

2018-03-12  Aaron Sawdey  <acsaw...@linux.vnet.ibm.com>

PR target/84743
* config/rs6000/rs6000.c (rs6000_reassociation_width): Disable parallel
reassociation for int modes.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 258101)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -10006,7 +10006,7 @@
   if (VECTOR_MODE_P (mode))
 	return 4;
   if (INTEGRAL_MODE_P (mode)) 
-	return opc == MULT_EXPR ? 4 : 6;
+	return 1;
   if (FLOAT_MODE_P (mode))
 	return 4;
   break;


Re: [PATCH][AArch64] PR84114: Avoid reassociating FMA

2018-02-27 Thread Aaron Sawdey
On Tue, 2018-02-27 at 14:21 +, Wilco Dijkstra wrote:
> Richard Biener <richard.guent...@gmail.com>
> 
> > It happens that on some targets doing two FMAs in parallel and one
> > non-FMA operation merging them is faster than chaining three
> > FMAs...
> 
> Like I mentioned in the PR, long chains should be broken, but for
> that we need a new parameter to state how long a chain may be before
> it is split. The issue today is that it splits even very short
> chains, removing beneficial FMAs.
> 
> > But yes, somewhere I suggested that FMA detection should/could be
> > integrated with reassociation.

I'd also like to see some work here. 

Doing two FMAs in parallel and then a non-FMA merge is faster on ppc,
but it would be nice if the target had more control over exactly how
this happens.

Parallel reassociation also increases register pressure, so it would be
nice to be able to avoid problems arising from that.

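A sketch of the two shapes under discussion, using the ISO C fma() from
math.h as a stand-in for the target's hardware FMA (my example, not from
the thread; note the two forms can round differently, which is why this
is a reassociation question at all):

#include <math.h>

/* Chained: three FMAs, each waiting on the previous one.  */
double
dot3_chained (double a, double b, double c, double d,
              double e, double f, double acc)
{
  return fma (e, f, fma (c, d, fma (a, b, acc)));
}

/* Split: two independent FMAs plus one non-FMA merge; faster when FMA
   latency exceeds the cost of the extra multiply and add.  */
double
dot3_split (double a, double b, double c, double d,
            double e, double f, double acc)
{
  double t1 = fma (a, b, acc);
  double t2 = fma (c, d, e * f);
  return t1 + t2;
}
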
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH, rs6000][PR debug/83758] v2 rs6000_internal_arg_pointer should only return a register

2018-01-30 Thread Aaron Sawdey
On Tue, 2018-01-30 at 14:04 +0100, Jakub Jelinek wrote:
> On IRC when discussing it with Segher this morning we've come to the
> conclusion that it would be best if rs6000 just followed what all
> other ports do, i.e. return a pseudo from the target hook, like:
> 
> --- gcc/config/rs6000/rs6000.c	2018-01-30 12:30:27.416360076 +0100
> +++ gcc/config/rs6000/rs6000.c	2018-01-30 13:59:07.360639803 +0100
> @@ -29602,8 +29602,9 @@ rs6000_internal_arg_pointer (void)
> 	  emit_insn_after (pat, get_insns ());
> 	  pop_topmost_sequence ();
> 	}
> -  return plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
> -			FIRST_PARM_OFFSET (current_function_decl));
> +  rtx ret = plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
> +			   FIRST_PARM_OFFSET (current_function_decl));
> +  return copy_to_reg (ret);
>  }
>    return virtual_incoming_args_rtx;
>  }
> 
> copy_to_reg is what e.g. the generic or pa target hook conditionally
> uses.

This fix looks good; it passes bootstrap and the go tests run.

Segher is currently regtesting on ppc64le power9. OK for trunk if tests
pass?

2018-01-30  Aaron Sawdey  <acsaw...@linux.vnet.ibm.com>

    * config/rs6000/rs6000.c (rs6000_internal_arg_pointer): Only
	return a reg rtx.
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 257188)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -29602,8 +29602,9 @@
 	  emit_insn_after (pat, get_insns ());
 	  pop_topmost_sequence ();
 	}
-  return plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
-			FIRST_PARM_OFFSET (current_function_decl));
+  rtx ret = plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
+			   FIRST_PARM_OFFSET (current_function_decl));
+  return copy_to_reg (ret);
 }
   return virtual_incoming_args_rtx;
 }
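
For context, my reading of why this helps (not an authoritative
statement of the hook contract): consumers such as var-tracking expect
crtl->args.internal_arg_pointer to be a bare register they can match,
and copy_to_reg arranges roughly the following before the pseudo is
returned:

/* Sketch of the RTL copy_to_reg emits here (illustrative only):

     (set (reg:Pmode tmp)
          (plus:Pmode (reg split_stack_arg_pointer)
                      (const_int FIRST_PARM_OFFSET)))

   so the hook hands back a plain (reg:Pmode tmp) rather than a bare
   PLUS expression.  */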


Re: [PATCH][PR debug/83758] look more carefully for internal_arg_pointer in vt_add_function_parameter()

2018-01-30 Thread Aaron Sawdey
On Tue, 2018-01-30 at 11:13 +0100, Jakub Jelinek wrote:
> On Tue, Jan 30, 2018 at 03:55:58AM -0600, Segher Boessenkool wrote:
> > > But in that case, what does the copying?
> > 
> > I don't know.  Aaron will look at it, but timezones etc. :-)

Indeed I did see unshare_all_rtl() copying internal_arg_pointer, but it
also happens in several places in function.c:

assign_parm_adjust_entry_rtl:
  move_block_from_reg (REGNO (entry_parm),
   validize_mem (copy_rtx (stack_parm)),
   data->partial / UNITS_PER_WORD);

assign_parm_setup_reg:
  /* Copy the value into the register, thus bridging between
 assign_parm_find_data_types and expand_expr_real_1.  */

  equiv_stack_parm = data->stack_parm;
  validated_mem = validize_mem (copy_rtx (data->entry_parm));

assign_parm_setup_block:
  mem = validize_mem (copy_rtx (stack_parm));

in expr.c:

expand_expr_real_1:
  /* DECL_MODE might change when TYPE_MODE depends on attribute target
 settings for VECTOR_TYPE_P that might switch for the function.  */
  if (currently_expanding_to_rtl
  && code == VAR_DECL && MEM_P (decl_rtl)
  && VECTOR_TYPE_P (type) && exp && DECL_MODE (exp) != mode)
decl_rtl = change_address (decl_rtl, TYPE_MODE (type), 0);
  else
decl_rtl = copy_rtx (decl_rtl);


> > 
> > > That's what seems strange.  I can see why we'd have two nested
> > > pluses with the inner plus being pointer-equal to
> > > internal_arg_ptr.
> > > And I can see why we'd have a single canonical plus (which IMO
> > > would
> > > be better, but I agree it's not stage 4 material).  It's having
> > > the two
> > > nested pluses without maintaining pointer equality that seems
> > > strange.
> > 
> > The inner plus is *not* pointer-equal, that is the
> > problem.  Something
> > did copy_rtx (or such) on it, many things do.  We can tell you what
> > exactly later today.
> 
> Most likely unshare_all_rtl, which does:
>   for (tree decl = DECL_ARGUMENTS (cfun->decl); decl; decl = DECL_CHAIN (decl))
>     {
>       if (DECL_RTL_SET_P (decl))
> 	SET_DECL_RTL (decl, copy_rtx_if_shared (DECL_RTL (decl)));
>       DECL_INCOMING_RTL (decl) = copy_rtx_if_shared (DECL_INCOMING_RTL (decl));
>     }
> 
> Anyway, my preference would be to change that gen_rtx_PLUS into
>   stack_parm = crtl->args.internal_arg_pointer;
>   if (!CONST_INT_P (offset_rtx))
>     stack_parm = gen_rtx_PLUS (Pmode, stack_parm, offset_rtx);
>   else if (offset_rtx != const0_rtx)
>     stack_parm = plus_constant (Pmode, stack_parm, INTVAL (offset_rtx));
>   stack_parm = gen_rtx_MEM (data->promoted_mode, stack_parm);
> and deal specially with GET_CODE (crtl->args.internal_arg_pointer)
> in var-tracking.c.
> rs6000/powerpcspe with -fsplit-stack are the only cases where
> crtl->args.internal_arg_pointer is not a REG, so just running libgo
> testsuite on powerpc{,64,64le} should cover it all.

I'll give this a try today when I get to the office.

Thanks,
Aaron


> 
>   Jakub
> 
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][PR debug/83758] look more carefully for internal_arg_pointer in vt_add_function_parameter()

2018-01-29 Thread Aaron Sawdey
This bug appears to revolve around whether there is a canonical rtx for
internal_arg_pointer in var-tracking. In vt_add_function_parameter() we
currently have:

static void
vt_add_function_parameter (tree parm)
{
  rtx decl_rtl = DECL_RTL_IF_SET (parm);
  rtx incoming = DECL_INCOMING_RTL (parm);
  tree decl;
  machine_mode mode;
  poly_int64 offset;
  dataflow_set *out;
  decl_or_value dv;

  if (TREE_CODE (parm) != PARM_DECL)
return;

  if (!decl_rtl || !incoming)
return;

  if (GET_MODE (decl_rtl) == BLKmode || GET_MODE (incoming) == BLKmode)
return;

  /* If there is a DRAP register or a pseudo in internal_arg_pointer,
 rewrite the incoming location of parameters passed on the stack
 into MEMs based on the argument pointer, so that incoming doesn't
 depend on a pseudo.  */
  if (MEM_P (incoming)
      && (XEXP (incoming, 0) == crtl->args.internal_arg_pointer
	  || (GET_CODE (XEXP (incoming, 0)) == PLUS
	      && XEXP (XEXP (incoming, 0), 0)
		 == crtl->args.internal_arg_pointer
	      && CONST_INT_P (XEXP (XEXP (incoming, 0), 1)))))
    {
      HOST_WIDE_INT off = -FIRST_PARM_OFFSET (current_function_decl);
      if (GET_CODE (XEXP (incoming, 0)) == PLUS)
	off += INTVAL (XEXP (XEXP (incoming, 0), 1));
      incoming
	= replace_equiv_address_nv (incoming,
				    plus_constant (Pmode,
						   arg_pointer_rtx, off));
    }


This code looks for crtl->args.internal_arg_pointer within the rtx
incoming. The problem I am seeing is that a structurally identical rtx
is there, but it is not the same object, so it is not found by the ==
comparison in the current code. Hence my patch below switches from ==
to rtx_equal_p(). If the expression is not rewritten, the pseudo
created for the stack pointer is not preserved, and later we hit the
assert near the beginning of vt_expand_var_loc_chain().

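The heart of this is pointer identity versus structural equality. A
standalone sketch of the failure mode (hypothetical expr type, my own
code, not GCC internals):

#include <assert.h>
#include <stddef.h>

struct expr { int code; long val; struct expr *op; };

static int
expr_equal_p (const struct expr *a, const struct expr *b)
{
  if (a == b)
    return 1;                   /* Identity: the check the old code did.  */
  if (!a || !b || a->code != b->code || a->val != b->val)
    return 0;
  return expr_equal_p (a->op, b->op);  /* Structure: like rtx_equal_p.  */
}

int
main (void)
{
  struct expr inner = { 1, 0, NULL };
  struct expr copy = inner;     /* A copy, as unshare_all_rtl makes.  */
  assert (&copy != &inner);               /* Pointer comparison fails...  */
  assert (expr_equal_p (&copy, &inner));  /* ...structural still holds.  */
  return 0;
}
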
Bootstrap now passes for languages=c,c++,go on ppc64le. If
bootstrap/regtest is ok on ppc64le and x86_64, ok for trunk?


2018-01-29  Aaron Sawdey  <acsaw...@linux.vnet.ibm.com>

* var-tracking.c (vt_add_function_parameter): Fix comparison of rtx.


Index: gcc/var-tracking.c
===================================================================
--- gcc/var-tracking.c  (revision 257159)
+++ gcc/var-tracking.c  (working copy)
@@ -9668,10 +9668,10 @@
      into MEMs based on the argument pointer, so that incoming doesn't
      depend on a pseudo.  */
   if (MEM_P (incoming)
-      && (XEXP (incoming, 0) == crtl->args.internal_arg_pointer
+      && (rtx_equal_p (XEXP (incoming, 0), crtl->args.internal_arg_pointer)
 	  || (GET_CODE (XEXP (incoming, 0)) == PLUS
-	      && XEXP (XEXP (incoming, 0), 0)
-		 == crtl->args.internal_arg_pointer
+	      && rtx_equal_p (XEXP (XEXP (incoming, 0), 0),
+			      crtl->args.internal_arg_pointer)
 	      && CONST_INT_P (XEXP (XEXP (incoming, 0), 1)))))
     {
       HOST_WIDE_INT off = -FIRST_PARM_OFFSET (current_function_decl);


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH] reduce runtime of gcc.dg/memcmp-1.c test

2018-01-10 Thread Aaron Sawdey
This brings the test's runtime back to not quite where it was, but it is
a lot more reasonable than what I put into 256351.

2018-01-10  Aaron Sawdey  <acsaw...@linux.vnet.ibm.com>

* gcc.dg/memcmp-1.c: Reduce runtime to something reasonable.

OK for trunk?

Thanks, 
   Aaron


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: /home/sawdey/src/gcc/trunk/trunk/gcc/testsuite/gcc.dg/memcmp-1.c
===================================================================
--- /home/sawdey/src/gcc/trunk/trunk/gcc/testsuite/gcc.dg/memcmp-1.c	(revision 256437)
+++ /home/sawdey/src/gcc/trunk/trunk/gcc/testsuite/gcc.dg/memcmp-1.c	(working copy)
@@ -12,8 +12,20 @@
 int lib_strncmp(const char *a, const char *b, size_t n) asm("strncmp");
 
 #ifndef NRAND
+#ifdef TEST_ALL
 #define NRAND 1
+#else
+#define NRAND 500
 #endif
+#endif
+#ifndef TZONE
+#ifdef TEST_ALL
+#define TZONE 16
+#else
+#define TZONE 8
+#endif
+#endif
+
 #define MAX_SZ 600
 
 #define DEF_RS(ALIGN)  \
@@ -33,9 +45,7 @@
 	  b = four+i*ALIGN+j*(4096-2*i*ALIGN);   \
 	  memcpy(a,str1,sz);		   \
 	  memcpy(b,str2,sz);		   \
-	  asm(" ");			   \
 	  r = memcmp(a,b,sz);		   \
-	  asm(" ");			   \
 	  if ( r < 0 && !(expect < 0) ) abort();			   \
 	  if ( r > 0 && !(expect > 0) )	abort();			   \
 	  if ( r == 0 && !(expect == 0) ) abort();			   \
@@ -67,15 +77,13 @@
 	{
 	  for (a1=0; a1 < 2*sizeof(void *); a1++)
 	{
+	  a = three+i*a1+j*(4096-2*i*a1);
+	  memcpy(a,str1,sz);
 	  for (a2=0; a2 < 2*sizeof(void *); a2++)
 		{
-		  a = three+i*a1+j*(4096-2*i*a1);
 		  b = four+i*a2+j*(4096-2*i*a2);
-		  memcpy(a,str1,sz);
 		  memcpy(b,str2,sz);
-		  asm(" ");
 		  r = memcmp(a,b,sz);
-		  asm(" ");
 		  if ( r < 0 && !(expect < 0) ) abort();
 		  if ( r > 0 && !(expect > 0) )	abort();
 		  if ( r == 0 && !(expect == 0) ) abort();
@@ -89,7 +97,7 @@
 void (test_strncmp)(const char *, const char *, int),
   size_t sz, int align)
 {
-  char buf1[MAX_SZ*2+10],buf2[MAX_SZ*2+10];
+  char buf1[MAX_SZ*2+TZONE],buf2[MAX_SZ*2+TZONE];
   size_t test_sz = (sz<MAX_SZ)?sz:MAX_SZ;
   size_t diff_pos, zero_pos;
   uint32_t e;
@@ -111,8 +119,8 @@
   (*test_strncmp)(buf1,buf2,e);
 }
   }
-  for(diff_pos = ((test_sz>10)?(test_sz-10):0); diff_pos < test_sz+10; diff_pos++)
-for(zero_pos = ((test_sz>10)?(test_sz-10):0); zero_pos < test_sz+10; zero_pos++)
+  for(diff_pos = ((test_sz>TZONE)?(test_sz-TZONE):0); diff_pos < test_sz+TZONE; diff_pos++)
+for(zero_pos = ((test_sz>TZONE)?(test_sz-TZONE):0); zero_pos < test_sz+TZONE; zero_pos++)
   {
 	memset(buf1, 'A', 2*test_sz);
 	memset(buf2, 'A', 2*test_sz);
@@ -125,7 +133,6 @@
 	(*test_memcmp)(buf2,buf2,0);
 	test_memcmp_runtime_size (buf1, buf2, sz, e);
 	test_memcmp_runtime_size (buf2, buf1, sz, -e);
-	test_memcmp_runtime_size (buf2, buf2, sz, 0);
 	e = lib_strncmp(buf1,buf2,sz);
 	(*test_strncmp)(buf1,buf2,e);
 	(*test_strncmp)(buf2,buf1,-e);
@@ -470,10 +477,8 @@
 DEF_TEST(9,1)
 DEF_TEST(16,1)
 DEF_TEST(32,1)
-DEF_TEST(100,1)
-DEF_TEST(100,8)
-DEF_TEST(180,1)
-DEF_TEST(180,8)
+DEF_TEST(33,8)
+DEF_TEST(49,1)
 #endif
 
 int
@@ -753,9 +758,7 @@
 RUN_TEST(9,1)
 RUN_TEST(16,1)
 RUN_TEST(32,1)
-RUN_TEST(100,1)
-RUN_TEST(100,8)
-RUN_TEST(180,1)
-RUN_TEST(180,8)
+RUN_TEST(33,8)
+RUN_TEST(49,1)
 #endif
 }

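Reading the diff, the two knobs trade coverage for runtime (my
interpretation; the macros are otherwise undocumented): NRAND is the
number of random string pairs tried per size (500 by default, 1 under
TEST_ALL, which instead widens the deterministic sweeps), and TZONE is
how far around the compare length the difference and terminator
positions are swept (8 by default, 16 under TEST_ALL).

/* A sketch of an exhaustive standalone run (assuming the dg-
   directives are satisfied or stripped):

     gcc -O2 -DTEST_ALL memcmp-1.c -o memcmp-1 && ./memcmp-1

   Building without -DTEST_ALL uses the quicker NRAND=500 / TZONE=8
   defaults this patch introduces.  */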

Re: [PATCH, rs6000] generate loop code for memcmp inline expansion

2018-01-10 Thread Aaron Sawdey
I'll check the runtime of that. I added some test cases to memcmp-1.c
and it is probably now taking too long. I will revise it so it's no
longer than it was before.

  Aaron

On Wed, 2018-01-10 at 14:25 +, Szabolcs Nagy wrote:
> On 08/01/18 19:37, Aaron Sawdey wrote:
> > On Tue, 2017-12-12 at 10:13 -0600, Segher Boessenkool wrote:
> > > > Please fix those trivialities, and it's okay for trunk (after
> > > > the
> > > > rtlanal patch is approved too).  Thanks!
> > 
> > Here's the final version of this, which is committed as 256351.
> > 
> > 
> > 2018-01-08  Aaron Sawdey  <acsaw...@linux.vnet.ibm.com>
> > 
> > * config/rs6000/rs6000-string.c
> > (do_load_for_compare_from_addr): New
> > function.
> > (do_ifelse): New function.
> > (do_isel): New function.
> > (do_sub3): New function.
> > (do_add3): New function.
> > (do_load_mask_compare): New function.
> > (do_overlap_load_compare): New function.
> > (expand_compare_loop): New function.
> > (expand_block_compare): Call expand_compare_loop() when
> > appropriate.
> > * config/rs6000/rs6000.opt (-mblock-compare-inline-limit):
> > Change
> > option description.
> > (-mblock-compare-inline-loop-limit): New option.
> > 
> 
> ...
> > Index: gcc/testsuite/gcc.dg/memcmp-1.c
> > ===
> > --- gcc/testsuite/gcc.dg/memcmp-1.c (revision 256350)
> > +++ gcc/testsuite/gcc.dg/memcmp-1.c (working copy)
> > @@ -14,11 +14,80 @@
> >  #ifndef NRAND
> >  #define NRAND 1
> >  #endif
> > -#define MAX_SZ 200
> > +#define MAX_SZ 600
> >  
> 
> i see timeouts when running aarch64-none-elf tests in some
> emulator environments:
> 
> WARNING: program timed out.
> FAIL: gcc.dg/memcmp-1.c execution test
> 
> if there is a way to reduce the iteration count or the
> tested variants that would help slow targets.
> 
> > +#define DEF_RS(ALIGN)                                              \
> > +static void test_memcmp_runtime_size_ ## ALIGN (const char *str1,  \
> > +                                                const char *str2,  \
> > +                                                size_t sz,         \
> > +                                                int expect)        \
> > +{                                                                  \
> > +  char three[8192] __attribute__ ((aligned (4096)));               \
> > +  char four[8192] __attribute__ ((aligned (4096)));                \
> > +  char *a, *b;                                                     \
> > +  int i,j,a1,a2,r;                                                 \
> > +  for (j = 0; j < 2; j++)                                          \
> > +    {                                                              \
> > +      for (i = 0; i < 2; i++)                                      \
> > +        {                                                          \
> > +          a = three+i*ALIGN+j*(4096-2*i*ALIGN);                    \
> > +          b = four+i*ALIGN+j*(4096-2*i*ALIGN);                     \
> > +          memcpy(a,str1,sz);                                       \
> > +          memcpy(b,str2,sz);                                       \
> > +          asm(" ");                                                \
> > +          r = memcmp(a,b,sz);                                      \
> > +          asm(" ");                                                \
> > +          if ( r < 0 && !(expect < 0) ) abort();                   \
> > +          if ( r > 0 && !(expect > 0) ) abort();                   \
> > +          if ( r == 0 && !(expect == 0) ) abort();                 \
> > +        }                                                          \
> > +    }                                                              \
> > +}
> > +
> > +DEF_RS(1)
> > +DEF_RS(2)
> > +DEF_RS(4)
> > +DEF_RS(8)
> > +DEF_RS(16)
> > +
> > +static void test_memcmp_runtime_size (const char *str1, const char *str2,
> > +                                      size_t sz, int expect)
> > +{
> > +  char three[8192] __attribute__ ((aligned (4096)));

Re: [PATCH, rs6000] generate loop code for memcmp inline expansion

2018-01-08 Thread Aaron Sawdey
On Tue, 2017-12-12 at 10:13 -0600, Segher Boessenkool wrote:
> Please fix those trivialities, and it's okay for trunk (after the
> rtlanal patch is approved too).  Thanks!

Here's the final version of this, which is committed as 256351.


2018-01-08  Aaron Sawdey  <acsaw...@linux.vnet.ibm.com>

* config/rs6000/rs6000-string.c (do_load_for_compare_from_addr): New
function.
(do_ifelse): New function.
(do_isel): New function.
(do_sub3): New function.
(do_add3): New function.
(do_load_mask_compare): New function.
(do_overlap_load_compare): New function.
(expand_compare_loop): New function.
(expand_block_compare): Call expand_compare_loop() when appropriate.
* config/rs6000/rs6000.opt (-mblock-compare-inline-limit): Change
option description.
(-mblock-compare-inline-loop-limit): New option.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===================================================================
--- gcc/config/rs6000/rs6000-string.c	(revision 256350)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -303,6 +303,959 @@
   return MIN (base_align, offset & -offset);
 }
 
+/* Prepare address and then do a load.
+
+   MODE is the mode to use for the load.
+   DEST is the destination register for the data.
+   ADDR is the address to be loaded.
+   ORIG_ADDR is the original address expression.  */
+static void
+do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
+			   rtx orig_addr)
+{
+  rtx mem = gen_rtx_MEM (mode, addr);
+  MEM_COPY_ATTRIBUTES (mem, orig_addr);
+  set_mem_size (mem, GET_MODE_SIZE (mode));
+  do_load_for_compare (dest, mem, mode);
+  return;
+}
+
+/* Do a branch for an if/else decision.
+
+   CMPMODE is the mode to use for the comparison.
+   COMPARISON is the rtx code for the compare needed.
+   A is the first thing to be compared.
+   B is the second thing to be compared.
+   CR is the condition code reg input, or NULL_RTX.
+   TRUE_LABEL is the label to branch to if the condition is true.
+
+   If CR is null_rtx, then a new register of CMPMODE is generated and
+   used for the comparison.
+   If A and B are both null_rtx, then CR must not be null, and the
+   compare is not generated so you can use this with a dot form insn.  */
+
+static void
+do_ifelse (machine_mode cmpmode, rtx_code comparison,
+	   rtx a, rtx b, rtx cr, rtx true_label)
+{
+  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
+	  || (a != NULL_RTX && b != NULL_RTX));
+
+  if (cr != NULL_RTX)
+gcc_assert (GET_MODE (cr) == cmpmode);
+  else
+cr = gen_reg_rtx (cmpmode);
+
+  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
+
+  if (a != NULL_RTX)
+emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
+
+  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
+
+  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
+  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  JUMP_LABEL (j) = true_label;
+  LABEL_NUSES (true_label) += 1;
+}
+
+/* Emit an isel of the proper mode for DEST.
+
+   DEST is the isel destination register.
+   CMP is the comparison rtx for the isel condition.
+   SRC_T is the isel source if the condition is true.
+   SRC_F is the isel source if the condition is false.
+   CR is the condition register for the isel.  */
+static void
+do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
+  else
+emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
+}
+
+/* Emit a subtract of the proper mode for DEST.
+
+   DEST is the destination register for the subtract.
+   SRC1 is the first subtract input.
+   SRC2 is the second subtract input.
+
+   Computes DEST = SRC1-SRC2.  */
+static void
+do_sub3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_subdi3 (dest, src1, src2));
+  else
+emit_insn (gen_subsi3 (dest, src1, src2));
+}
+
+/* Emit an add of the proper mode for DEST.
+
+   DEST is the destination register for the add.
+   SRC1 is the first add input.
+   SRC2 is the second add input.
+
+   Computes DEST = SRC1+SRC2.  */
+static void
+do_add3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_adddi3 (dest, src1, src2));
+  else
+emit_insn (gen_addsi3 (dest, src1, src2));
+}
+
+/* Generate rtl for a load, shift, and compare of less than a full word.
+
+   LOAD_MODE is the machine mode for the loads.
+   DIFF is the reg for the difference.
+   CMP_REM is the reg containing the remaining bytes to compare.
+   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
+   SRC1_ADDR is the first source address.
+   SRC2_ADDR is the second source address.
+   ORIG_SRC1 is t
