#2289: Needless reboxing of values when returning from a tight loop
-------------------------------------------+--------------------------------
    Reporter:  dons                        |       Owner:          
        Type:  run-time performance bug    |      Status:  new     
    Priority:  normal                      |   Component:  Compiler
     Version:  6.8.2                       |    Severity:  normal  
    Keywords:  boxing, loops, performance  |    Testcase:          
Architecture:  Unknown                     |          Os:  Unknown 
-------------------------------------------+--------------------------------
 GHC wants to box up strict values when returning from tight inner loops,
 even when they're
 immediately taken apart. This leads to redundant instructions in the
 bodies of tight loops,
 and more code.

 It affects, in particular, loops that result from fusion, which need to be
 tight, but often return multiple values via unlifted pairs.

 Consider this program:

 {{{
 {-# OPTIONS -fexcess-precision #-}
 {-# LANGUAGE TypeOperators #-}

 import System.Environment
 import Text.Printf
 import Data.Array.Vector

 mean :: UArr Double -> Double
 mean arr = s / fromIntegral l
   where
     s :*: l = foldlU k (0 :*: 0) arr :: (Double :*: Int)
     k (s :*: n) x = s+x :*: n+1

 main = do
     [d] <- map read `fmap` getArgs
     printf "%f\n" (mean (enumFromToFracU 1 d))
 }}}

 It generates this rather good Core (ghc 6.8.2):

 {{{
 $s$wfold_s1rB :: Double#
                -> Int#
                -> Double#
                -> (# Double, Int #)

 $s$wfold_s1rB =
 \ (sc_s1rr :: Double#)
   (sc1_s1rs :: Int#)
   (sc2_s1rt :: Double#) ->
   case >## sc_s1rr y_a1pr of wild4_X1no {
     False ->
       $s$wfold_s1rB
         (+## sc_s1rr 1.0)
         (+# sc1_s1rs 1)
         (+## sc2_s1rt sc_s1rr);
     True -> (# D# sc2_s1rt, I# sc1_s1rs #)
   };
 } in
 case $s$wfold_s1rB 2.0 1 1.0 of ww_s1qg { (# ww1_s1qi, ww2_s1qj #) ->
 case ww1_s1qi of wild4_a1mC { D# x_a1mE ->
 case ww2_s1qj of wild5_aP6 { I# x1_aP8 ->
 case /## x_a1mE (int2Double# x1_aP8)
 of wild21_a1mK { __DEFAULT ->
 D# wild21_a1mK
 }}}

 But note, what's this?

 {{{
     True -> (# D# sc2_s1rt, I# sc1_s1rs #)
   };
 } in
 case $s$wfold_s1rB 2.0 1 1.0 of ww_s1qg { (# ww1_s1qi, ww2_s1qj #) ->
 case ww1_s1qi of wild4_a1mC { D# x_a1mE ->
 case ww2_s1qj of wild5_aP6 { I# x1_aP8 ->
 case /## x_a1mE (int2Double# x1_aP8)

 }}}

 The return values of what was a strict pair are boxed, placed in an
 unboxed tuple,
 and then immediately unboxed and the division takes place.

 Ok, let's isolate this. Here, the boxed return, from the inner loop:

 {{{
 mean_s19V :: Double#
            -> Int#
            -> Double#
            -> (# Double, Int #)

 mean_s19V =
 \ (ds1_dD3 :: Double#)
   (ds2_dD4 :: Int#)
   (ds3_dD5 :: Double#) ->
   case >## ds1_dD3 d#_aoG of wild4_Xw {
     False ->
       mean_s19V
         (+## ds1_dD3 1.0)
         (+# ds2_dD4 1)
         (+## ds3_dD5 ds1_dD3);
     True -> (# D# ds3_dD5, I# ds2_dD4 #)
   };
 } in
 case mean_s19V 2.0 1 1.0 of wild4_Xr { (# ds1_dCV, ds2_dCW #) ->
 case ds1_dCV of wild5_Xv { D# x_aoR ->
 case ds2_dCW of wild6_Xy { I# y_aoS ->
 case /## x_aoR (int2Double# y_aoS) of wild7_XB { __DEFAULT ->
 D# wild7_XB
 }}}

 And the inner loop and exit:

 {{{
 s1bd_info:

   -- what's this stuff?
   leaq        32(%r12), %rax
   cmpq        %r15, %rax
   movq        %rax, %r12
   ja  .L17

   -- ok, to business:
   ucomisd     5(%rbx), %xmm5
   ja  .L19
   movapd      %xmm6, %xmm0
   leaq        -32(%rax), %r12
   incq        %rsi
   addsd       %xmm5, %xmm0
   addsd       .LC1(%rip), %xmm5
   movapd      %xmm0, %xmm6
   jmp s1bd_info


 .L19:
   movq        %rsi, -16(%rax)
   movq        $base_GHCziBase_Izh_con_info, -24(%rax)
   movq        $base_GHCziFloat_Dzh_con_info, -8(%rax)
   movsd       %xmm6, (%rax)
   leaq        -7(%rax), %rbx
   leaq        -23(%rax), %rsi
   jmp *(%rbp)
 }}}

 Now, I can avoid the reboxing manually:

 {{{
 mean_s19R :: Double#
            -> Int#
            -> Double#
            -> (# Double#, Int# #)

 mean_s19R =
 \ (ds1_dCZ :: Double#)
   (ds2_dD0 :: Int#)
   (ds3_dD1 :: Double#) ->
   case >## ds1_dCZ d#_aoG of wild4_Xw {
     False ->
       mean_s19R
         (+## ds1_dCZ 1.0)
         (+# ds2_dD0 1)
         (+## ds3_dD1 ds1_dCZ);
     True -> (# ds3_dD1, ds2_dD0 #)
   };
 } in
 case mean_s19R 2.0 1 1.0 of wild4_Xr { (# x_aoR, y_aoS #) ->
 case /## x_aoR (int2Double# y_aoS) of wild5_Xv { __DEFAULT ->
 D# wild5_Xv
 }}}

 And we get:

 {{{
 s1b9_info:
   -- hey , our junk is gone!

   ucomisd     5(%rbx), %xmm5
   ja  .L17
   movapd      %xmm6, %xmm0
   incq        %rsi
   addsd       %xmm5, %xmm0
   addsd       .LC1(%rip), %xmm5
   movapd      %xmm0, %xmm6
   jmp s1b9_info

 -- cool, that was it, let's go home:
 .L17:
   movapd      %xmm6, %xmm5
   movq        %rsi, %rbx
   jmp *(%rbp)

 }}}

 Which is a much better result. The loop is tighter.

 What can be done here?

-- 
Ticket URL: <http://hackage.haskell.org/trac/ghc/ticket/2289>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler
_______________________________________________
Glasgow-haskell-bugs mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/glasgow-haskell-bugs

Reply via email to