Re: [PATCH, rs6000] PR87532: Bad results from vec_extract (unsigned char, foo) dependent upon function inline

2019-04-09 Thread Segher Boessenkool
Hi Kelvin,

On Tue, Apr 09, 2019 at 08:15:31AM -0500, Kelvin Nilsen wrote:
> This new patch addresses the code generation problems that were uncovered by 
> these failing tests.  Additionally, this new patch corrects some of the 
> expected instruction counts for certain previously existing regression tests 
> on certain targets to adjust for changes in the generated code.

Is the newly generated code better though?  Worse?  More expected?  It isn't
clear to me.

> This new patch has been bootstrapped and tested without regressions on 
> powerpcle-unknown-linux (both P8 and P9) and on powerpc-linux (P7 and P8, 
> both -m32 and -m64).

powerpcle-linux is a very different configuration than the powerpc64le-linux
you mean (powerpc64-linux and powerpc-linux are biarch to each other, but
the LE variants are not).  Very minor...  It's just that powerpcle-linux
makes me cringe.

>   * gcc.target/powerpc/fold-vec-extract-short.p8.c: Likewise.:

(stray colon)

> +// P8 LE variable offset: rldicl, subfic, sldi, mtvsrd, xxpermdi, vslo, 
> mfvsrd, sradi, rlwin, (extsb)

rlwinm

> -/* { dg-final { scan-assembler-times {\mrlwinm\M} 2 { target lp64} } } */
> +/* { dg-final { scan-assembler-times {\mrlwinm\M} 4 { target lp64} } } */

Put a space after lp64 while you're here?  Or remove the one before target,
whichever you like best.


I'm a bit worried that these tests change instruction counts so often, what
that means for maintainability.  But, okay for trunk, and for backports (but
please make sure they generate the correct counts for all targets there :-/ )


Segher


[PATCH, rs6000] PR87532: Bad results from vec_extract (unsigned char, foo) dependent upon function inline

2019-04-09 Thread Kelvin Nilsen
A patch to address this problem report was committed on 3/15/2019.  Some of the 
new regressions tests submitted with that initial patch failed on P8 big-endian 
and on P9 little-endian.

This new patch addresses the code generation problems that were uncovered by 
these failing tests.  Additionally, this new patch corrects some of the 
expected instruction counts for certain previously existing regression tests on 
certain targets to adjust for changes in the generated code.

This new patch has been bootstrapped and tested without regressions on 
powerpcle-unknown-linux (both P8 and P9) and on powerpc-linux (P7 and P8, both 
-m32 and -m64).

Is this ok for trunk and backports?

Thanks.

gcc/ChangeLog:

2019-04-09  Kelvin Nilsen  

PR target/87532
* config/rs6000/rs6000.c (rs6000_split_vec_extract_var): Use inner
mode of vector rather than mode of destination for move instruction.
* config/rs6000/vsx.md (*vsx_extract__mode_var):
Use QI inner mode with V16QI vector mode.

gcc/testsuite/ChangeLog:

2019-04-09  Kelvin Nilsen  

PR target/87532
* gcc.target/powerpc/fold-vec-extract-char.p8.c: Adjust expected
instruction counts.
* gcc.target/powerpc/fold-vec-extract-int.p8.c: Likewise.
* gcc.target/powerpc/fold-vec-extract-short.p8.c: Likewise.:

Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 270127)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -7167,7 +7167,7 @@
  rtx tmp_altivec)
 {
   machine_mode mode = GET_MODE (src);
-  machine_mode scalar_mode = GET_MODE (dest);
+  machine_mode scalar_mode = GET_MODE_INNER (GET_MODE (src));
   unsigned scalar_size = GET_MODE_SIZE (scalar_mode);
   int byte_shift = exact_log2 (scalar_size);
 
Index: gcc/config/rs6000/vsx.md
===
--- gcc/config/rs6000/vsx.md(revision 270127)
+++ gcc/config/rs6000/vsx.md(working copy)
@@ -3739,9 +3739,9 @@
   DONE;
 })
 
-(define_insn_and_split "*vsx_extract___var"
-  [(set (match_operand:SDI 0 "gpc_reg_operand" "=r,r,r")
-   (zero_extend:SDI
+(define_insn_and_split "*vsx_extract__mode_var"
+  [(set (match_operand: 0 "gpc_reg_operand" "=r,r,r")
+   (zero_extend:
 (unspec:
  [(match_operand:VSX_EXTRACT_I 1 "input_operand" "wK,v,m")
   (match_operand:DI 2 "gpc_reg_operand" "r,r,r")]
@@ -3753,7 +3753,7 @@
   "&& reload_completed"
   [(const_int 0)]
 {
-  machine_mode smode = mode;
+  machine_mode smode = mode;
   rs6000_split_vec_extract_var (gen_rtx_REG (smode, REGNO (operands[0])),
operands[1], operands[2],
operands[3], operands[4]);
Index: gcc/testsuite/gcc.target/powerpc/fold-vec-extract-char.p8.c
===
--- gcc/testsuite/gcc.target/powerpc/fold-vec-extract-char.p8.c (revision 
270127)
+++ gcc/testsuite/gcc.target/powerpc/fold-vec-extract-char.p8.c (working copy)
@@ -6,9 +6,9 @@
 /* { dg-options "-mdejagnu-cpu=power8 -O2" } */
 
 // six tests total. Targeting P8LE / P8BE.
-// P8 LE variable offset: rldicl, subfic, sldi, mtvsrd, xxpermdi, vslo, 
mfvsrd, sradi, (extsb)
+// P8 LE variable offset: rldicl, subfic, sldi, mtvsrd, xxpermdi, vslo, 
mfvsrd, sradi, rlwin, (extsb)
 // P8 LE constant offset: vspltb, mfvsrd, rlwinm, (extsb)
-// P8 BE variable offset: sldi, mtvsrd, xxpermdi, vslo, 
mfvsrd, sradi, (extsb)
+// P8 BE variable offset: sldi, mtvsrd, xxpermdi, vslo, 
mfvsrd, sradi, rlwinm, (extsb)
 // P8 BE constant offset: vspltb, mfvsrd, rlwinm, (extsb)
 
 /* { dg-final { scan-assembler-times {\mrldicl\M} 3 { target { le } } } } */
@@ -21,7 +21,7 @@
 /* { dg-final { scan-assembler-times {\msrdi\M} 3 { target lp64 } } } */
 /* { dg-final { scan-assembler-times "extsb" 2 } } */
 /* { dg-final { scan-assembler-times {\mvspltb\M} 3 { target lp64 } } } */
-/* { dg-final { scan-assembler-times {\mrlwinm\M} 2 { target lp64} } } */
+/* { dg-final { scan-assembler-times {\mrlwinm\M} 4 { target lp64} } } */
 
 /* multiple codegen variations for -m32. */
 /* { dg-final { scan-assembler-times {\mrlwinm\M} 3 { target ilp32} } } */
Index: gcc/testsuite/gcc.target/powerpc/fold-vec-extract-int.p8.c
===
--- gcc/testsuite/gcc.target/powerpc/fold-vec-extract-int.p8.c  (revision 
270127)
+++ gcc/testsuite/gcc.target/powerpc/fold-vec-extract-int.p8.c  (working copy)
@@ -7,14 +7,14 @@
 
 // Targeting P8 (LE) and (BE).  6 tests total.
 // P8 LE constant:  vspltw, mfvsrwz, (1:extsw/2:rldicl)
-// P8 LE variables: rldicl, subfic,  sldi, mtvsrd, xxpermdi, vslo, mfvsrd, 
sradi, (1:extsw)
+// P8 LE variables: subfic,  sldi, mtvsrd, xxpermdi, vslo, mfvsrd, sradi, 
(1:extsw/5:rldicl))
 // P8 BE constant:  vspltw, mfvsrwz, (1:extsw/2:rldicl)
-// P8 

Re: [PATCH, rs6000] PR87532: Bad Results from vec_extract(unsigned char, foo) dependent upon function inline

2019-03-08 Thread Segher Boessenkool
Hi Kelvin,

On Fri, Mar 08, 2019 at 10:59:35AM -0600, Kelvin Nilsen wrote:
> This problem report, though initially motivated by differences in behavior 
> between constant and non-constant selector arguments, uncovered a number of 
> inconsistencies in the implementation of vec_extract.
> 
> This patch provides several fixes to make handling of constant selector 
> expressions the same as the handling of non-constant selector expressions.  
> In the process of testing, it was observed that certain existing regression 
> tests were looking for the wrong instructions to be emitted and those tests 
> have been updated.
> 
> This has bootstrapped and tested without regressions on 
> powerpc64le-unknown-linux (both P8 and P9) and on powerpc-linux (P7 
> big-endian, with both -m32 and -m64 target options).

Thanks for the patch.  I have lots of comments/questions, mostly about the
testcases.  Sorry :-/  The actual compiler code part looks good though.


> --- gcc/testsuite/gcc.target/powerpc/vsx-builtin-10a.c(revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/vsx-builtin-10a.c(revision 0)
> @@ -0,0 +1,157 @@
> +/* { dg-do run { target { powerpc*-*-* } } } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */

Is there any reason to think this testcase will fail on Darwin?  I mean, it
requires VSX, and that should skip the testcase already on Darwin?

> +/* { dg-require-effective-target dfp_hw } */

Why is that?  I don't see any dfp used here?

> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { } } 
> */

You do not use -mcpu=, so why this dg-skip-if?  And please set
-mdejagnu-cpu= instead.

> --- gcc/testsuite/gcc.target/powerpc/fold-vec-extract-char.p8.c   
> (revision 269370)
> +++ gcc/testsuite/gcc.target/powerpc/fold-vec-extract-char.p8.c   
> (working copy)
> @@ -18,7 +18,7 @@
>  /* { dg-final { scan-assembler-times {\mxxpermdi\M} 3 { target lp64 } } } */
>  /* { dg-final { scan-assembler-times {\mvslo\M} 3 { target lp64 } } } */
>  /* { dg-final { scan-assembler-times {\mmfvsrd\M} 6 { target lp64 } } } */
> -/* { dg-final { scan-assembler-times {\msradi\M} 3 { target lp64 } } } */
> +/* { dg-final { scan-assembler-times {\msrdi\M} 3 { target lp64 } } } */

Maybe allow both?  So {\msra?di\M}?  Or does sradi make no sense here?

> --- gcc/testsuite/gcc.target/powerpc/vsx-builtin-14a.c(revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/vsx-builtin-14a.c(revision 0)
> @@ -0,0 +1,128 @@
> +/* { dg-do run { target { powerpc*-*-* } } } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-require-effective-target dfp_hw } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { } } 
> */
> +/* { dg-options "-maltivec" } */
> +
> +/* This test should run the same on any target that supports altivec/dfp
> +   instructions.  Intentionally not specifying cpu in order to test
> +   all code generation paths.  */

I don't see where dfp comes in?  For server CPUs it is p6 and up, the same
as AltiVec, but that is coincidental.

> --- gcc/testsuite/gcc.target/powerpc/pr87532-mc.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/pr87532-mc.c (revision 0)
> @@ -0,0 +1,260 @@
> +/* { dg-do run { target { powerpc*-*-* && lp64 } } } */

Check for int128, instead?  Or is there another reason to want lp64?

> +  __asm__ (" # %x0" : "+wa" (a));

"wa" requires VSX.

> --- gcc/testsuite/gcc.target/powerpc/vsx-builtin-14b.c(revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/vsx-builtin-14b.c(revision 0)
> @@ -0,0 +1,128 @@
> +/* { dg-do run { target { powerpc*-*-* } } } */

Btw, you can just say { dg-do run }: everything in gcc.target/powerpc is
only run for powerpc*-*-* targets.

> --- gcc/testsuite/gcc.target/powerpc/vec-extract-v16qiu-v2b.c (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/vec-extract-v16qiu-v2b.c (revision 0)
> @@ -0,0 +1,16 @@
> +/* { dg-do run { target { powerpc*-*-linux* } } } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-require-effective-target vsx_hw } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { } } 
> */
> +/* { dg-options "-O2 -mvsx -save-temps -dp -g" } */

I think that is debugging code left over?  -dp -g should be harmless, but
-save-temps is littering ;-)

> --- gcc/testsuite/gcc.target/powerpc/vec-extract-v16qiu-v2.h  (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/vec-extract-v16qiu-v2.h  (revision 0)

> +static vector TYPE
> +deoptimize (vector TYPE a)
> +{
> +  __asm__ (" # %x0" : "+wa" (a));
> +  return a;
> +}

Is this only ever compiled with VSX?  If not, use "v" instead?

> --- gcc/config/rs6000/rs6000-c.c  (revision 269370)
> +++ gcc/config/rs6000/rs6000-c.c  (working copy)
> @@ -6568,9 +6568,15 @@
> /* If the second argument is an integer constant, if the value is in
>the expected range, generate the built-in code if we can.  We need
>64-bit and direct move to extract the 

[PATCH, rs6000] PR87532: Bad Results from vec_extract(unsigned char, foo) dependent upon function inline

2019-03-08 Thread Kelvin Nilsen
This problem report, though initially motivated by differences in behavior 
between constant and non-constant selector arguments, uncovered a number of 
inconsistencies in the implementation of vec_extract.

This patch provides several fixes to make handling of constant selector 
expressions the same as the handling of non-constant selector expressions.  In 
the process of testing, it was observed that certain existing regression tests 
were looking for the wrong instructions to be emitted and those tests have been 
updated.

This has bootstrapped and tested without regressions on 
powerpc64le-unknown-linux (both P8 and P9) and on powerpc-linux (P7 big-endian, 
with both -m32 and -m64 target options).

Is this ok for trunk?

gcc/ChangeLog:

2019-03-08  Kelvin Nilsen  

PR target/87532
* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
When handling vec-extract, use modular arithmetic to allow
constant selectors greater than vector length.
* config/rs6000/rs6000.c (rs6000_expand_vector_extract): Allow
V1TImode vectors to have constant selector values greater than 0.
Use modular arithmetic to compute vector index.
(rs6000_split_vec_extract_var): Use modular arithmetic to compute
index for in-memory vectors.  Correct code generation for
in-register vectors.
(altivec_expand_vec_ext_builtin): Use modular arithmetic to
compute index.

gcc/testsuite/ChangeLog:

2019-03-08  Kelvin Nilsen  

PR target/87532
* gcc.target/powerpc/vsx-builtin-10a.c: New test.
* gcc.target/powerpc/vsx-builtin-20a.c: New test.
* gcc.target/powerpc/vsx-builtin-11b.c: New test.
* gcc.target/powerpc/vsx-builtin-9b.c: New test.
* gcc.target/powerpc/vsx-builtin-12a.c: New test.
* gcc.target/powerpc/vsx-builtin-13b.c: New test.
* gcc.target/powerpc/vsx-builtin-14a.c: New test.
* gcc.target/powerpc/vec-extract-v16qiu-v2a.c: New test.
* gcc.target/powerpc/vsx-builtin-15b.c: New test.
* gcc.target/powerpc/vsx-builtin-16a.c: New test.
* gcc.target/powerpc/vsx-builtin-17b.c: New test.
* gcc.target/powerpc/vsx-builtin-18a.c: New test.
* gcc.target/powerpc/pr87532-mc.c: New test.
* gcc.target/powerpc/vsx-builtin-19b.c: New test.
* gcc.target/powerpc/vsx-builtin-10b.c: New test.
* gcc.target/powerpc/vsx-builtin-11a.c: New test.
* gcc.target/powerpc/vsx-builtin-9a.c: New test.
* gcc.target/powerpc/vsx-builtin-20b.c: New test.
* gcc.target/powerpc/vsx-builtin-12b.c: New test.
* gcc.target/powerpc/vsx-builtin-13a.c: New test.
* gcc.target/powerpc/vsx-builtin-14b.c: New test.
* gcc.target/powerpc/vsx-builtin-15a.c: New test.
* gcc.target/powerpc/vec-extract-v16qiu-v2b.c: New test.
* gcc.target/powerpc/pr87532.c: New test.
* gcc.target/powerpc/vsx-builtin-16b.c: New test.
* gcc.target/powerpc/vec-extract-v16qiu-v2.h: New test.
* gcc.target/powerpc/vsx-builtin-17a.c: New test.
* gcc.target/powerpc/vsx-builtin-18b.c: New test.
* gcc.target/powerpc/vsx-builtin-19a.c: New test.
* gcc.target/powerpc/fold-vec-extract-char.p8.c: Modify expected
instruction selection.
* gcc.target/powerpc/fold-vec-extract-short.p8.c: Likewise.
* gcc.target/powerpc/fold-vec-extract-int.p8.c: Likewise.

Index: gcc/testsuite/gcc.target/powerpc/vsx-builtin-10a.c
===
--- gcc/testsuite/gcc.target/powerpc/vsx-builtin-10a.c  (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/vsx-builtin-10a.c  (revision 0)
@@ -0,0 +1,157 @@
+/* { dg-do run { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target dfp_hw } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { } } */
+/* { dg-options "-maltivec" } */
+
+/* This test should run the same on any target that supports altivec/dfp
+   instructions.  Intentionally not specifying cpu in order to test
+   all code generation paths.  */
+
+#include 
+
+extern void abort (void);
+
+#define CONST0 (0)
+#define CONST1 (1)
+#define CONST2 (2)
+#define CONST3 (3)
+#define CONST4 (4)
+#define CONST5 (5)
+#define CONST6 (6)
+#define CONST7 (7)
+
+
+/* Test that indices > length of vector are applied modulo the vector
+   length.  */
+
+/* Test for vector residing in register.  */
+short s3 (vector short v)
+{
+  return __builtin_vec_ext_v8hi (v, 3);
+}
+
+short s7 (vector short v)
+{
+  return __builtin_vec_ext_v8hi (v, 7);
+}
+
+short s21 (vector short v)
+{
+  return __builtin_vec_ext_v8hi (v, 21);
+}
+
+short s30 (vector short v)
+{
+  return __builtin_vec_ext_v8hi (v, 30);
+}
+
+/* Test for vector residing in memory.  */
+short ms3 (vector short *vp)
+{
+  return