[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-08-28 Thread uros at kss-loka dot si


--- Comment #10 from uros at kss-loka dot si  2006-08-29 06:12 ---
(In reply to comment #9)
> Fixed on the mainline by:
> http://gcc.gnu.org/ml/gcc-patches/2006-08/msg01036.html

Not really; the above patch fixed only one of the three problems. The other two
remain, namely:

- ivopts problem (see comment #6)
- -march=pentium4 (see comment #8)

I'll try to find out which option causes the problem described in comment #8.


-- 

uros at kss-loka dot si changed:

   What|Removed |Added

Summary|[4.0/4.1 Regression]|[4.0/4.1/4.2 Regression]
   |Optimizer regression:   |Optimizer regression:
   |SciMark sparse matrix   |SciMark sparse matrix
   |benchmark   |benchmark


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-08-17 Thread uros at kss-loka dot si


--- Comment #8 from uros at kss-loka dot si  2006-08-17 07:45 ---
It is also interesting that -march=pentium4 produces the following "de-optimized"
code, which adds a couple more instructions and wastes the %eax register:

.L8:
leal (%ebx,%ebx), %eax
movl 40(%esp), %edx
movl (%edx,%eax,2), %edx
movl %edx, (%esp)
movl 40(%esp), %edx
movl 4(%edx,%eax,2), %ecx
movapd  %xmm2, %xmm1
cmpl %ecx, (%esp)
jge .L11
movl (%esp), %edx
.L12:

Some additional timings (gcc-4.2 -O2 -fomit-frame-pointer):

-march=pentium4: 0m2.756s
-march=pentium4 -fno-ivopts: 0m2.500s
-march=pentium4 -fno-ivopts -mfpmath=sse: 0m2.461s
-msse2 -fno-ivopts -mfpmath=sse: 0m2.311s

In the last case, the generated code is identical to the code generated by gcc-3.2:

.L8:
movl 36(%esp), %edx
movapd  %xmm2, %xmm1
movl (%edx,%ebx,4), %eax
movl 4(%edx,%ebx,4), %ecx
cmpl %ecx, %eax
jge .L11
movl %eax, %edx
.p2align 4,,7
.L12:
movl (%edi,%edx,4), %eax
movsd   (%esi,%eax,8), %xmm0
mulsd   (%ebp,%edx,8), %xmm0
addl $1, %edx
cmpl %edx, %ecx
addsd   %xmm0, %xmm1
jg  .L12


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-08-17 Thread uros at kss-loka dot si


--- Comment #7 from uros at kss-loka dot si  2006-08-17 07:21 ---
(In reply to comment #6)

> I think that remaining time difference is due to strange loop above innermost:

... due to strange _header_ above innermost loop ...

The problem is that we load zero in both arms of the "if".

This is what I get in .099t.optimized (using gcc-4.2 -O2 -fno-ivopts):

:;
  r.0 = (unsigned int) r;
  D.1556 = r.0 * 4;
  rowR = *((int *) D.1556 + row);
  rowRp1 = *((int *) D.1556 + row + 4B);
  if (rowR < rowRp1) goto ; else goto ;

:;
  sum = 0.0;
  goto  ();

:;
  i = rowR;
  sum = 0.0;

The assignment to sum should be moved before the if...

SSE is somehow able to CSE the zero load during RTL:

.L8:
movl 20(%ebp), %edx
movapd  %xmm2, %xmm1
movl (%edx,%ebx,4), %eax
movl 4(%edx,%ebx,4), %ecx
cmpl %ecx, %eax
jge .L11
movl %eax, %edx
.p2align 4,,7
.L12:


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-08-16 Thread uros at kss-loka dot si


--- Comment #6 from uros at kss-loka dot si  2006-08-16 12:15 ---
IMO the problem here is in IVopts. Using gcc-3.x, the innermost loop compiles
to:

.L15:
movl (%edi,%edx,4), %eax
fldl (%ebp,%edx,8)
addl $1, %edx
fmull   (%esi,%eax,8)
cmpl %ecx, %edx
faddp   %st, %st(1)
jl  .L15

and with current SVN gcc-4.2 into:

.L12:
movl (%ecx), %eax
fldl (%ebp,%eax,8)
fmull   (%edx)
faddp   %st, %st(1)
addl $1, %ebx
addl $4, %ecx
addl $8, %edx
cmpl %esi, %ebx
jne .L12

Adding -fno-ivopts, this loop gets compiled into:

.L12:
movl (%edi,%edx,4), %eax
fldl (%esi,%eax,8)
fmull   (%ebp,%edx,8)
faddp   %st, %st(1)
addl $1, %edx
cmpl %edx, %ecx
jg  .L12

Timings (-O3 -march=pentium4 -fomit-frame-pointer):

gcc-3.2: 0m2.301s
gcc-4.2: 0m2.713s
gcc-4.2 + -fno-ivopts: 0m2.473s

with:

gcc version 3.2 20020903 (Red Hat Linux 8.0 3.2-7)
gcc version 4.2.0 20060816 (experimental)

I think that the remaining time difference is due to the strange loop above the innermost one:
gcc-3.2:

fld %st(0)
.L16:
movl 36(%esp), %eax
fld %st(0)
movl 4(%eax,%ebx,4), %ecx
movl (%eax,%ebx,4), %edx
cmpl %ecx, %edx
jge .L23
.L15:
movl (%edi,%edx,4), %eax
fldl (%ebp,%edx,8)
addl $1, %edx
fmull   (%esi,%eax,8)
cmpl %ecx, %edx
faddp   %st, %st(1)
jl  .L15
.L23:
movl 28(%esp), %eax
fstpl   (%eax,%ebx,8)
addl $1, %ebx
cmpl 24(%esp), %ebx
jl  .L16


gcc-4.2:

.L8:
movl 36(%esp), %edx
movl (%edx,%edi,4), %eax
movl 4(%edx,%edi,4), %esi
fldz
cmpl %esi, %eax
jge .L11
fstp %st(0)
movl 40(%esp), %ebx
leal (%ebx,%eax,4), %ecx
movl 32(%esp), %ebx
leal (%ebx,%eax,8), %edx
fldz
xorl %ebx, %ebx
subl %eax, %esi
.L12:
movl (%ecx), %eax
fldl (%ebp,%eax,8)
fmull   (%edx)
faddp   %st, %st(1)
addl $1, %ebx
addl $4, %ecx
addl $8, %edx
cmpl %esi, %ebx
jne .L12
.L11:
movl 28(%esp), %eax
fstpl   (%eax,%edi,8)
addl $1, %edi
cmpl 24(%esp), %edi
jne .L8


and gcc-4.2 -fno-ivopts:

.L8:
leal (%ebx,%ebx), %eax
movl 40(%esp), %edx
movl (%edx,%eax,2), %edx
movl %edx, (%esp)
movl 40(%esp), %edx
movl 4(%edx,%eax,2), %ecx
fldz
cmpl %ecx, (%esp)
jge .L11
fstp %st(0)
movl (%esp), %edx
fldz
.L12:
movl (%edi,%edx,4), %eax
fldl (%esi,%eax,8)
fmull   (%ebp,%edx,8)
faddp   %st, %st(1)
addl $1, %edx
cmpl %edx, %ecx
jg  .L12
.L11:
movl 32(%esp), %ecx
fstpl   (%ecx,%ebx,8)
addl $1, %ebx
cmpl %ebx, 28(%esp)
jg  .L8


-- 

uros at kss-loka dot si changed:

   What|Removed |Added

 CC||uros at kss-loka dot si
 Status|UNCONFIRMED |NEW
 Ever Confirmed|0   |1
   Last reconfirmed|0000-00-00 00:00:00 |2006-08-16 12:15:56
   date||


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-08-15 Thread pinskia at gcc dot gnu dot org


--- Comment #5 from pinskia at gcc dot gnu dot org  2006-08-16 06:50 ---
Can someone try the mainline again after Paolo B.'s patch?


-- 

pinskia at gcc dot gnu dot org changed:

   What|Removed |Added

   Keywords||missed-optimization


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-07-10 Thread rguenth at gcc dot gnu dot org


--- Comment #4 from rguenth at gcc dot gnu dot org  2006-07-10 12:45 ---
On a Pentium 4, with -O3 -march=pentium4 -fomit-frame-pointer -o bench
Random.i SparseCompRow.i array.i kernel.i main.i, I get:

3.4.6: 3.48s 
4.0.3: 4.44s
4.1.1: 4.12s
4.2.0: 4.13s


-- 

rguenth at gcc dot gnu dot org changed:

   What|Removed |Added

 CC||rguenth at gcc dot gnu dot
   ||org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-06-06 Thread gcc at pdoerfler dot com


--- Comment #3 from gcc at pdoerfler dot com  2006-06-06 11:22 ---
I get the following with -O3 -march=pentium4 -fomit-frame-pointer on a pentium4
gentoo machine:

gcc-3.4.6   gcc-4.0.2   gcc-4.1.1
2.69s   4.14s   3.26s

These are all with gentoo's patches.
Also, current mainline is the same as gcc-4.1.1

I can confirm that the difference without -fomit-frame-pointer is much smaller.
In fact, 3.4.6 and 4.1.1 are almost the same without it. 


-- 

gcc at pdoerfler dot com changed:

   What|Removed |Added

 CC||gcc at pdoerfler dot com


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-06-04 Thread pinskia at gcc dot gnu dot org


--- Comment #2 from pinskia at gcc dot gnu dot org  2006-06-04 20:06 ---
It would be nice if we could get 4.1.x numbers.


-- 

pinskia at gcc dot gnu dot org changed:

   What|Removed |Added

   GCC host triplet|i686-pc-linux-gnu   |
 GCC target triplet||i686-pc-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676



[Bug rtl-optimization/21676] [4.0/4.1/4.2 Regression] Optimizer regression: SciMark sparse matrix benchmark

2006-06-04 Thread jsm28 at gcc dot gnu dot org


-- 

jsm28 at gcc dot gnu dot org changed:

   What|Removed |Added

Summary|Optimizer regression:   |[4.0/4.1/4.2 Regression]
   |SciMark sparse matrix   |Optimizer regression:
   |benchmark   |SciMark sparse matrix
   ||benchmark
   Target Milestone|--- |4.1.2


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676