[Bug middle-end/60960] New: Wrong result when a vector variable is divided by a literal constant

2014-04-25 Thread uranus at tinlans dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60960

Bug ID: 60960
   Summary: Wrong result when a vector variable is divided by a
literal constant
   Product: gcc
   Version: 4.8.2
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: middle-end
  Assignee: unassigned at gcc dot gnu.org
  Reporter: uranus at tinlans dot org

 gcc -v
Using built-in specs.
COLLECT_GCC=/usr/x86_64-pc-linux-gnu/gcc-bin/4.8.2/gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-pc-linux-gnu/4.8.2/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: /var/tmp/portage/sys-devel/gcc-4.8.2/work/gcc-4.8.2/configure
--prefix=/usr --bindir=/usr/x86_64-pc-linux-gnu/gcc-bin/4.8.2
--includedir=/usr/lib/gcc/x86_64-pc-linux-gnu/4.8.2/include
--datadir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.8.2
--mandir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.8.2/man
--infodir=/usr/share/gcc-data/x86_64-pc-linux-gnu/4.8.2/info
--with-gxx-include-dir=/usr/lib/gcc/x86_64-pc-linux-gnu/4.8.2/include/g++-v4
--host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu --disable-altivec
--disable-fixed-point --without-cloog --disable-lto --enable-nls
--without-included-gettext --with-system-zlib --enable-obsolete
--disable-werror --enable-secureplt --enable-multilib
--with-multilib-list=m32,m64 --enable-libmudflap --disable-libssp
--enable-libgomp
--with-python-dir=/share/gcc-data/x86_64-pc-linux-gnu/4.8.2/python
--enable-checking=release --enable-java-awt=gtk --enable-libstdcxx-time
--enable-objc-gc --enable-languages=c,c++,java,objc,obj-c++,fortran
--enable-shared --enable-threads=posix --enable-__cxa_atexit
--enable-clocale=gnu --enable-targets=all --with-bugurl=http://bugs.gentoo.org/
--with-pkgversion='Gentoo 4.8.2 p1.0, pie-0.5.8'
Thread model: posix
gcc version 4.8.2 (Gentoo 4.8.2 p1.0, pie-0.5.8)

---
Example code (2 files to avoid the interference from the automatic inline
feature):

/* vec.c */
typedef unsigned char v4qi __attribute__ ((vector_size (4)));

v4qi f1 (v4qi v);
v4qi f2 (v4qi v);
v4qi f3 (v4qi x, v4qi y);
void print (v4qi v);

int
main ()
{
  v4qi x = { 5, 5, 5, 5 };
  v4qi y = { 2, 2, 2, 2 };
  v4qi z;

  z = f1 (x);
  print (z);

  z = f2 (x);
  print (z);

  z = f3 (x, y);
  print (z);

  return 0;
}

/* vec-impl.c */
#include <stdio.h>

typedef unsigned char v4qi __attribute__ ((vector_size (4)));

v4qi
f1 (v4qi v)
{
  return v / 2;
}

v4qi
f2 (v4qi v)
{
  return v / (v4qi) { 2, 2, 2, 2 };
}

v4qi
f3 (v4qi x, v4qi y)
{
  return x / y;
}

void
print (v4qi v)
{
  printf ("%d %d %d %d\n", v[3], v[2], v[1], v[0]);
}

---
Command line:
 gcc -O3 -c vec.c
 gcc -O3 -c vec-impl.c
 gcc -O3 vec.o vec-impl.o -o test
 ./test

Output:
2 130 130 130
2 130 130 130
2 2 2 2
---

Although the target doesn't support this operation, I remember GCC is able to
expand it to proper scalar operations.
I expected all three outputs to be identical, but the results returned
by f1 () and f2 () are wrong.
The whole vector is treated as an integer variable and right shifted by 1 in f1
() and f2 ().
By using the command gcc -O3 -fdump-tree-all -da -S vec-impl.c, we can see
the assembly code of f1 () and f2 () is wrong:
f1:
.LFB24:
.cfi_startproc  
movl    %edi, %eax
shrl    %eax
ret
.cfi_endproc
.LFE24:
.size   f1, .-f1  
.p2align 4,,15   
.globl  f2
.type   f2, @function
I don't show the assembly code of f2 () because it is the same.

Here is the RTL expansion result of the function f1 () in the file
vec-impl.c.166r.expand:
(note 4 1 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
(insn 2 4 3 2 (set (reg/v:SI 61 [ v ])
(reg:SI 5 di [ v ])) vec-impl.c:7 -1
 (nil))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 7 2 (parallel [
(set (reg:SI 62 [ D.2425 ])
(lshiftrt:SI (reg/v:SI 61 [ v ])
(const_int 1 [0x1])))
(clobber (reg:CC 17 flags))
]) vec-impl.c:8 -1
 (nil))
(insn 7 6 11 2 (set (reg:SI 60 [ retval ])
(reg:SI 62 [ D.2425 ])) vec-impl.c:8 -1
 (nil))
(insn 11 7 14 2 (set (reg/i:SI 0 ax)
(reg:SI 60 [ retval ])) vec-impl.c:9 -1
 (nil))
(insn 14 11 0 2 (use (reg/i:SI 0 ax)) vec-impl.c:9 -1
 (nil))

And here is its corresponding GIMPLE:
f1 (v4qi v)
{
  vector(4) unsigned char _4;

;;   basic block 2, loop depth 0
;;pred:   ENTRY
  _4 = v_1(D) >> 1;
  return _4;
;;succ:   EXIT

}

I'm not sure whether it's correct or not; anyway, I can make sure the
transformation was done by the veclower pass.
We could see it was a vector operation in the file vec-impl.c.121t.loopdone:
;; Function f1 (f1, funcdef_no=24, decl_uid=2380, cgraph_uid=24)

f1 (v4qi v)
{
  v4qi _2;

  <bb 2>:
  _2 = v_1(D) / { 2, 2, 2, 2 };
  return _2;

}
And it was altered in the file vec-impl.c.122t.veclower21:
;; Function f1 (f1, funcdef_no=24, decl_uid=2380, cgraph_uid=24)

f1

[Bug rtl-optimization/58295] [4.8/4.9 regression] Missed zero-extension elimination in the combiner

2013-09-06 Thread uranus at tinlans dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58295

--- Comment #4 from Ling-hua Tseng <uranus at tinlans dot org> ---
(In reply to Jakub Jelinek from comment #3)
> So perhaps you should just look at combiner dump and see what insns it tried
> and failed to match and see if you couldn't add some of them into the
> affected backends.

It's exactly what I did. Unfortunately, the combiner doesn't give any other
chance to eliminate that redundant zero extension. The cases tried by the
combiner are:
1. (set (reg:SI) (zero_extend:SI (plus:QI (mem:QI) (const_int))))
2. (set (reg:QI) (plus:QI (mem:QI) (const_int)))
3. (set (reg:QI) (plus:QI (subreg:QI) (const_int)))
4. (set (reg:CC) (compare:CC (subreg:QI) (const_int)))
5. (set (reg:CC) (compare:CC (plus:QI (mem:QI) (const_int)) (const_int)))
6. (set (reg:SI) (leu:SI (subreg:QI) (const_int)))
7. (set (reg:SI) (leu:SI (subreg:QI) (const_int)))
8. (set (reg:SI) (leu:SI (plus:QI ...)))

You know 1 & 2 are impossible for most RISC targets, and making all the other ones
recognizable is lying to GCC that your target supports QImode
arithmetic/comparison. Telling GCC a lie here will result in some code
generation bugs. For example, you will find a fail case in
gcc/testsuite/gcc.c-torture/execute/980617-1.c while you are running a test if
you provide a QImode comparison in the machine description. Here is the source
code of that test case:
void foo (unsigned int * p)
{
  if ((signed char)(*p & 0xFF) == 17 || (signed char)(*p & 0xFF) == 18)
return;
  else
abort ();
}

int main ()
{
  int i = 0x30011;
  foo(&i);
  exit (0);
}

The MSB 16 bits contain 0x0003, and the LSB 16 bits contain 0x0011. Using -O3
to compile this code, you will find that GCC simplifies the expression '(signed
char)(*p & 0xFF) == 17 || (signed char)(*p & 0xFF) == 18' to an SImode
subtraction and a QImode comparison. The result is incorrect, because the target
only supports SImode comparisons, i.e., you actually generate an SImode
hardware instruction for the pattern of a QImode comparison, and the MSB 16 bits
are still dirty. Hence 3 ~ 8 are not patterns that we can match in the RTL
combination pass.

Therefore, we can conclude that the original case tried by the combiner is the
best way to merge/reduce the redundant zero extension insn.


[Bug rtl-optimization/58295] New: The combination pass doesn't eliminates some extra zero extensions

2013-09-02 Thread uranus at tinlans dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58295

Bug ID: 58295
   Summary: The combination pass doesn't eliminates some extra
zero extensions
   Product: gcc
   Version: 4.9.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: rtl-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: uranus at tinlans dot org

$ cat test.c
extern char zeb_test_array[10];

unsigned char ee_isdigit2(unsigned int i)
{
  unsigned char c = zeb_test_array[i];
  unsigned char retval;

  retval = ((c>='0') && (c<='9')) ? 1 : 0;
  return retval;
}

$ arm-eabi-gcc -v
Using built-in specs.
COLLECT_GCC=arm-eabi-gcc
COLLECT_LTO_WRAPPER=/home1/lhtseng/arm/4.9/libexec/gcc/arm-eabi/4.9.0/lto-wrapper
Target: arm-eabi
Configured with: ../../../../work/4.9/src/gcc-4.9.0/configure --target=arm-eabi
--prefix=/home1/lhtseng/arm/4.9 --disable-nls --disable-shared
--enable-languages=c --enable-__cxa_atexit --enable-c99 --enable-long-long
--enable-threads=single --with-newlib --disable-multilib --disable-libssp
--disable-libgomp --disable-decimal-float --disable-libffi --disable-libmudflap
--disable-lto --with-gmp=/home1/lhtseng/work/general
--with-mpfr=/home1/lhtseng/work/general --with-mpc=/home1/lhtseng/work/general
--with-isl=/home1/lhtseng/work/general --with-cloog=/home1/lhtseng/work/general
Thread model: single
gcc version 4.9.0 20130802 (experimental) (GCC) 

$ arm-eabi-gcc -O3 -S test.c
$ cat test.s
...
ee_isdigit2:
@ Function supports interworking.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
ldr r3, .L2
ldrb    r0, [r3, r0]    @ zero_extendqisi2
sub r0, r0, #48
and r0, r0, #255
cmp r0, #9
movhi   r0, #0
movls   r0, #1
bx  lr
...

The instruction 'and r0, r0, #255' is a redundant instruction which cannot be
eliminated by the RTL instruction combination pass. This pass was able to
handle this case before this commit:
http://gcc.gnu.org/viewcvs/gcc/trunk/gcc/simplify-rtx.c?r1=191909&r2=191928&pathrev=192303
And the code was re-organized to line 643 ~ 656 after this commit:
http://gcc.gnu.org/viewcvs/gcc/trunk/gcc/simplify-rtx.c?r1=192006&r2=192186&pathrev=192303
For example, GCC 4.6.3 can handle it perfectly.

In GCC 4.9.0, reverting the two commits or simply commenting out the lines mentioned
above can make the combination pass handle this case again:
$ arm-eabi-gcc-modified -O3 -da -S test.c
$ cat test.c.166r.expand
...
(insn 9 8 10 2 (set (reg:SI 120)
(plus:SI (subreg:SI (reg:QI 118) 0)
(const_int -48 [0xffd0]))) test.c:6 -1
 (nil))
(insn 10 9 11 2 (set (reg:SI 121)
(and:SI (reg:SI 120)
(const_int 255 [0xff]))) test.c:6 -1
 (nil))
(insn 11 10 12 2 (set (reg:CC 100 cc)
(compare:CC (reg:SI 121)
(const_int 9 [0x9]))) test.c:6 -1
 (nil))
(insn 12 11 13 2 (set (reg:SI 122)
(leu:SI (reg:CC 100 cc)
(const_int 0 [0]))) test.c:6 -1
 (nil))
...
$ cat test.c.197r.combine
...
Trying 9, 10 -> 11:
Failed to match this instruction:
(set (reg:CC 100 cc)
(compare:CC (plus:SI (reg:SI 119)
(const_int -48 [0xffd0]))
(const_int 9 [0x9])))
Successfully matched this instruction:
(set (reg:SI 121)
(plus:SI (reg:SI 119)
(const_int -48 [0xffd0])))
Successfully matched this instruction:
(set (reg:CC 100 cc)
(compare:CC (reg:SI 121)
(const_int 9 [0x9])))
deferring deletion of insn with uid = 9.
modifying insn i2    10: r121:SI=r119:SI-0x30
  REG_DEAD r119:SI
deferring rescan insn with uid = 10.
modifying insn i3    11: cc:CC=cmp(r121:SI,0x9)
  REG_DEAD r121:SI
deferring rescan insn with uid = 11.
...

The insn 10 is generated by (define_expand "zero_extendqisi2" ...) of ARM's
machine description. Before the commits I mentioned above, the combination pass
successfully combined it with the insn 9. However, after those commits, the
combination pass never tries to do the combination '9, 10 -> 11'.

After reading the commit messages of the file 'simplify-rtx.c', we can
understand that the commit r191928 was trying to optimize x86 code generation,
but it led to suboptimal code generation for the ARM target.


[Bug tree-optimization/58296] New: ivopts is unable to handle some loops altered by the loop header copying pass

2013-09-02 Thread uranus at tinlans dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58296

Bug ID: 58296
   Summary: ivopts is unable to handle some loops altered by the
loop header copying pass
   Product: gcc
   Version: 4.9.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: uranus at tinlans dot org

$ cat test.c
void bne_loop(unsigned int val,unsigned int N)
{
  int i;

  for (i=0;i<N;++i)
    printf("%d\n",val+i);
}

Please note that the comparison expression in the for loop, 'i < N', is a
comparison between a signed int variable and an unsigned int variable. If we
change the type of i from 'int' to 'unsigned int', the issue does not occur.

$ arm-eabi-gcc -v
Using built-in specs.
COLLECT_GCC=arm-eabi-gcc
COLLECT_LTO_WRAPPER=/home1/lhtseng/arm/4.9/libexec/gcc/arm-eabi/4.9.0/lto-wrapper
Target: arm-eabi
Configured with: ../../../../work/4.9/src/gcc-4.9.0/configure --target=arm-eabi
--prefix=/home1/lhtseng/arm/4.9 --disable-nls --disable-shared
--enable-languages=c --enable-__cxa_atexit --enable-c99 --enable-long-long
--enable-threads=single --with-newlib --disable-multilib --disable-libssp
--disable-libgomp --disable-decimal-float --disable-libffi --disable-libmudflap
--disable-lto --with-gmp=/home1/lhtseng/work/general
--with-mpfr=/home1/lhtseng/work/general --with-mpc=/home1/lhtseng/work/general
--with-isl=/home1/lhtseng/work/general --with-cloog=/home1/lhtseng/work/general
Thread model: single
gcc version 4.9.0 20130802 (experimental) (GCC) 

$ arm-eabi-gcc -O3 -fdump-tree-all -O3 -da -S test.c
$ cat -n test.s
...
27  .L3:
28  add r1, r1, r5
29  add r4, r4, #1
30  ldr r0, .L9
31  bl  printf
32  cmp r4, r6
33  mov r1, r4
34  bne .L3
...

The instruction 'mov r1, r4' is redundant. Reading the dump of the RTL
expansion pass shows how it is expanded:

$ cat test.c.166r.expand
...
;; i.0_4 = (unsigned int) i_9;

(insn 20 19 0 (set (reg:SI 110 [ i.0 ])
(reg/v:SI 112 [ i ])) ../test.c:6 -1
 (nil))
...

$ cat test.c.165t.optimized
...
  <bb 4>:
  # i_13 = PHI <i_9(5), 0(3)>
  # i.0_16 = PHI <i.0_4(5), 0(3)>
  _7 = i.0_16 + val_6(D);
  printf ("%d\n", _7);
  i_9 = i_13 + 1;
  i.0_4 = (unsigned int) i_9;
  if (i_9 != _15)
    goto <bb 5>;
  else
    goto <bb 6>;
...

It's surprising that the line 'i.0_4 = (unsigned int) i_9;' cannot be handled by
any tree-level or RTL-level optimization pass. After
doing some investigation, we finally found that using '-Os' or '-fno-tree-ch'
instead of '-O3' generates the optimized code, and the conversion is then
eliminated by ivopts properly:
$ arm-eabi-gcc -O3 -fdump-tree-all -O3 -fno-tree-ch -da -S test.c
$ cat test.c.119t.ivopts
  <bb 3>:
  _7 = ivtmp.9_11;
  printf ("%d\n", _7);
  ivtmp.9_10 = ivtmp.9_11 + 1;

  <bb 4>:
  # ivtmp.9_11 = PHI <val_6(D)(2), ivtmp.9_10(3)>
  if (ivtmp.9_11 != _12)
    goto <bb 3>;
  else
    goto <bb 5>;

$ cat test.s
...
.L3:
mov r1, r4
bl  printf
add r4, r4, #1
.L2:
cmp r4, r5
ldr r0, .L6
bne .L3
ldmfd   sp!, {r3, r4, r5, lr}
bx  lr
...

Therefore, it's believed that there is something wrong with ivopts, which is
unable to handle the loop altered by the tree-ch pass when there is a
comparison (int vs. unsigned int) in the condition field of a FOR statement.