There is a huge performance regression due to the recip tree pass. A problem was
found in povray-3.6.1, where -ffast-math was 3 times slower (for -mfpmath=sse or
-mfpmath=387).

The problem was traced to the POVFPU_RunDefault function, which was found to be
more than 20 (twenty!) times slower:

with -ffast-math:

Each sample counts as 0.00195312 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 75.69     41.10    41.10  2622181     0.00     0.00  pov::POVFPU_RunDefault(unsigned)
  2.68     42.55     1.46  3849666     0.00     0.00  pov::Intersect_Light_Tree(pov::Ray_Struct*, pov::Project_Tree_Node_Struct*, int, int, pov::istk_entry*, pov::Object_Struct**, pov::Light_Source_Struct*)
 
without -ffast-math:

Each sample counts as 0.00195312 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls   s/call   s/call  name    
 12.04      1.85     1.85  2622153     0.00     0.00  pov::POVFPU_RunDefault(unsigned)
  9.01      3.23     1.38  3849666     0.00     0.00  pov::Intersect_Light_Tree(pov::Ray_Struct*, pov::Project_Tree_Node_Struct*, int, int, pov::istk_entry*, pov::Object_Struct**, pov::Light_Source_Struct*)
  7.25      4.34     1.11   190627     0.00     0.00  pov::sbisect(int, pov::p*, double, double, int, int, double*)

The attached testcase, which was reduced from the actual source
(povray-3.6.1/source/fnpovfpu.cpp, POVFPU_RunDefault()), shows the problem with
the recip pass:

gcc -O2 -ffast-math:

<bb 0>:

  # pc_17 = PHI <0(0), pc_33(322)>;
  # r7_15 = PHI <r7_20(0), r7_16(322)>;
  # r6_13 = PHI <r6_21(0), r6_14(322)>;
  # r5_11 = PHI <r5_22(0), r5_12(322)>;
  # r4_9 = PHI <r4_23(0), r4_10(322)>;
  # r3_7 = PHI <r3_24(0), r3_8(322)>;
  # r2_5 = PHI <r2_25(0), r2_6(322)>;
  # r1_3 = PHI <r1_26(0), r1_4(322)>;
  # r0_1 = PHI <r0_27(0), r0_2(322)>;
<L0>:;
  reciptmp.51_38 = 1.0e+0 / r7_15;
  reciptmp.52_35 = 1.0e+0 / r6_13;
  reciptmp.53_34 = 1.0e+0 / r5_11;
  reciptmp.54_43 = 1.0e+0 / r4_9;
  reciptmp.55_40 = 1.0e+0 / r3_7;
  reciptmp.56_39 = 1.0e+0 / r2_5;
  reciptmp.57_48 = 1.0e+0 / r1_3;
  reciptmp.58_45 = 1.0e+0 / r0_1;
  D.1619_28 = pc_17 * 4;
  D.1620_29 = (unsigned int *) D.1619_28;
  D.1622_31 = *D.1620_29;
  D.1623_32 = D.1622_31 & 4095;
  switch (D.1623_32)
    {
      case 0: goto <L1>;
      case 1: goto <L2>;
      case 2: goto <L3>;
      case 3: goto <L4>;
      case 4: goto <L5>;
      case 5: goto <L6>;
      case 6: goto <L7>;
      case 7: goto <L8>;
      case 8: goto <L9>;
      case 9: goto <L10>;
      case 10: goto <L11>;
      ...

As shown, the recip pass moves _eight_ divisions in front of the switch. Now all
eight divisions are performed on _every_ pass through the switch, no matter
which case is taken. The produced code is horrible:

-mfpmath=sse:

POVFPU_RunDefault:
        pushl   %ebp
        xorl    %edx, %edx
        movl    %esp, %ebp
        subl    $64, %esp
        fldz
        .p2align 4,,15
.L4:
        movl    0(,%edx,4), %eax
        movsd   .LC1, %xmm7
        movsd   .LC1, %xmm6
        movsd   .LC1, %xmm5
        movsd   .LC1, %xmm4
        movsd   .LC1, %xmm3
        movsd   .LC1, %xmm2
        movsd   .LC1, %xmm1
        movsd   .LC1, %xmm0
        divsd   -8(%ebp), %xmm7
        divsd   -16(%ebp), %xmm6
        divsd   -24(%ebp), %xmm5
        andl    $4095, %eax
        divsd   -32(%ebp), %xmm4
        divsd   -40(%ebp), %xmm3
        cmpl    $319, %eax
        divsd   -48(%ebp), %xmm2
        divsd   -56(%ebp), %xmm1
        divsd   -64(%ebp), %xmm0
        ja      .L5
        jmp     *.L326(,%eax,4)
        .section        .rodata
        .align 4
        .align 4
.L326:
        .long   .L6
        .long   .L7
        .long   .L8
        .long   .L9
...

For -mfpmath=387, the code is even worse, as every case begins with a pack of
fstp's that discard the results of the unneeded divisions.

.L9:
    fstp %st(0)
    fstp %st(0)
    fstp %st(0)
    fstp %st(0)
    fstp %st(0)
    fstp %st(0)
    fstp %st(0)
    fstp %st(0)
    fldl -40(%ebp)
    incl %edx
    faddl -64(%ebp)
    fstpl -40(%ebp)
    jmp .L4

-- 
           Summary: Massive performance regression for -ffast-math due to
                    the recip tree pass
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P2
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: uros at kss-loka dot si
                CC: gcc-bugs at gcc dot gnu dot org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24123

Reply via email to