#4223: LLVM slower than NCG, C example
-------------------------------+--------------------------------------------
    Reporter:  dterei          |       Owner:  dterei                 
        Type:  bug             |      Status:  new                    
    Priority:  normal          |   Component:  Compiler (LLVM)        
     Version:  6.12.3          |    Keywords:                         
          Os:  Linux           |    Testcase:                         
Architecture:  x86_64 (amd64)  |     Failure:  Runtime performance bug
-------------------------------+--------------------------------------------
 The following program is slower when compiled via the LLVM backend than
 when compiled via C or NCG.
 {{{
 {-# LANGUAGE BangPatterns #-}

 {-
     ghc 6.12.1 -O2
     1.752
 -}

 import Data.Vector.Storable
 import qualified Data.Vector.Storable as V
 import Foreign
 import Foreign.C.Types

 -- | A vector of four 'CFloat' components.
 --
 -- Every field is strict (@!@) and carries an @UNPACK@ pragma, asking GHC to
 -- store the four floats directly in the constructor.
 data Vec4 = Vec4 {-# UNPACK #-} !CFloat
                  {-# UNPACK #-} !CFloat
                  {-# UNPACK #-} !CFloat
                  {-# UNPACK #-} !CFloat

 ------------------------------------------------------------------------

 -- | 'Storable' instance so that 'Vec4' values can live in a storable
 -- vector. The four components are read and written as consecutive
 -- 'CFloat' elements, first constructor field at element offset 0.
 instance Storable Vec4 where
   -- Four CFloat elements, aligned like a single CFloat.
   sizeOf _    = 4 * sizeOf (undefined :: CFloat)
   alignment _ = alignment (undefined :: CFloat)

   -- Load the components from element offsets 0..3 and rebuild the value.
   {-# INLINE peek #-}
   peek ptr = do
       x0 <- peekElemOff elems 0
       x1 <- peekElemOff elems 1
       x2 <- peekElemOff elems 2
       x3 <- peekElemOff elems 3
       return (Vec4 x0 x1 x2 x3)
     where
       -- Re-type the Vec4 pointer so it can be indexed per component.
       elems = castPtr ptr

   -- Store the components at element offsets 0..3, in field order.
   {-# INLINE poke #-}
   poke ptr (Vec4 x0 x1 x2 x3) = do
       pokeElemOff elems 0 x0
       pokeElemOff elems 1 x1
       pokeElemOff elems 2 x2
       pokeElemOff elems 3 x3
     where
       -- Re-type the Vec4 pointer so it can be indexed per component.
       elems = castPtr ptr

 ------------------------------------------------------------------------

 -- | Constant vector that 'multList' adds to every element on each pass.
 a :: Vec4
 a = Vec4 0.2 0.1 0.6 1.0

 -- | Constant vector that 'multList' multiplies every element by on each pass.
 m :: Vec4
 m = Vec4 0.99 0.7 0.8 0.6

 -- | Component-wise sum of two vectors.
 add :: Vec4 -> Vec4 -> Vec4
 {-# INLINE add #-}
 add (Vec4 x0 x1 x2 x3) (Vec4 y0 y1 y2 y3) =
     Vec4 (x0 + y0) (x1 + y1) (x2 + y2) (x3 + y3)

 -- | Component-wise product of two vectors.
 mult :: Vec4 -> Vec4 -> Vec4
 {-# INLINE mult #-}
 mult (Vec4 x0 x1 x2 x3) (Vec4 y0 y1 y2 y3) =
     Vec4 (x0 * y0) (x1 * y1) (x2 * y2) (x3 * y3)

 -- | Sum of the four components of a vector, added left to right.
 vsum :: Vec4 -> CFloat
 {-# INLINE vsum #-}
 vsum (Vec4 x0 x1 x2 x3) = x0 + x1 + x2 + x3

 -- | Apply the element-wise update @v * m + a@ to every element of the
 -- vector, @count@ times over. A non-positive @count@ returns the vector
 -- unchanged.
 multList :: Int -> Vector Vec4 -> Vector Vec4
 multList !count !src
     | count <= 0    = src
     -- The vector argument must stay inside this expression: placed at the
     -- declaration column it is parsed as a new top-level declaration.
     | otherwise     = multList (count-1)
                     $ V.map (\v -> add (mult v m) a) src

 -- | Build a vector of 'arraySize' zero vectors, pass it through 'multList'
 -- with 'repCount' passes, and print the sum of all components of the
 -- result.
 main :: IO ()
 main = do
     print $ V.sum
           $ V.map vsum
           $ multList repCount
           $ V.replicate arraySize (Vec4 0 0 0 0)

 -- | Number of update passes that 'main' requests from 'multList'.
 repCount :: Int
 repCount = 10000

 -- | Number of elements in the vector that 'main' builds.
 arraySize :: Int
 arraySize = 20000
 }}}

 Timings under Linux/64bit:
 {{{
   * fasm: 1.502s
   * viac: 1.525s
   * llvm: 1.853s
 }}}

 This isn't universal though (as in across all targets); for example, we get
 the following timings under these other targets:

 Windows/32bit:
 {{{
   * fasm: 3.279s
   * viac: 3.997s
   * llvm: 1.932s
 }}}

 Whether this is due to the 32-bit target or to Windows isn't known yet.

-- 
Ticket URL: <http://hackage.haskell.org/trac/ghc/ticket/4223>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler
_______________________________________________
Glasgow-haskell-bugs mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/glasgow-haskell-bugs

Reply via email to