Re: [GHC] #4223: LLVM slower then NCG, C example

GHC Mon, 26 Jul 2010 05:21:48 -0700

#4223: LLVM slower than NCG, C example
-------------------------------+--------------------------------------------
    Reporter:  dterei          |       Owner:  dterei                 
        Type:  bug             |      Status:  new                    
    Priority:  normal          |   Component:  Compiler (LLVM)        
     Version:  6.12.3          |    Keywords:                         
          Os:  Linux           |    Testcase:                         
Architecture:  x86_64 (amd64)  |     Failure:  Runtime performance bug
-------------------------------+--------------------------------------------
Description changed by dterei:


Old description:

> The following program is slower when compiled via the LLVM backend then
> when compiled via C or NCG.
> {{{
> {-# LANGUAGE BangPatterns #-}
>
> {-
>     ghc 6.12.1 -O2
>     1.752
> -}
>
> import Data.Vector.Storable
> import qualified Data.Vector.Storable as V
> import Foreign
> import Foreign.C.Types
>
> -- Define a 4 element vector type
> data Vec4 = Vec4 {-# UNPACK #-} !CFloat
>                  {-# UNPACK #-} !CFloat
>                  {-# UNPACK #-} !CFloat
>                  {-# UNPACK #-} !CFloat
>
> ------------------------------------------------------------------------
>
> -- Ensure we can store it in an array
> instance Storable Vec4 where
>   sizeOf _ = sizeOf (undefined :: CFloat) * 4
>   alignment _ = alignment (undefined :: CFloat)
>
>   {-# INLINE peek #-}
>   peek p = do
>              a <- peekElemOff q 0
>              b <- peekElemOff q 1
>              c <- peekElemOff q 2
>              d <- peekElemOff q 3
>              return (Vec4 a b c d)
>     where
>       q = castPtr p
>
>   {-# INLINE poke #-}
>   poke p (Vec4 a b c d) = do
>                             pokeElemOff q 0 a
>                             pokeElemOff q 1 b
>                             pokeElemOff q 2 c
>                             pokeElemOff q 3 d
>     where
>       q = castPtr p
>
> ------------------------------------------------------------------------
>
> a = Vec4 0.2 0.1 0.6 1.0
> m = Vec4 0.99 0.7 0.8 0.6
>
> add :: Vec4 -> Vec4 -> Vec4
> {-# INLINE add #-}
> add (Vec4 a b c d) (Vec4 a' b' c' d') = Vec4 (a+a') (b+b') (c+c') (d+d')
>
> mult :: Vec4 -> Vec4 -> Vec4
> {-# INLINE mult #-}
> mult (Vec4 a b c d) (Vec4 a' b' c' d') = Vec4 (a*a') (b*b') (c*c') (d*d')
>
> vsum :: Vec4 -> CFloat
> {-# INLINE vsum #-}
> vsum (Vec4 a b c d) = a+b+c+d
>
> multList :: Int -> Vector Vec4 -> Vector Vec4
> multList !count !src
>     | count <= 0    = src
>     | otherwise     = multList (count-1) $ V.map (\v -> add (mult v m) a)
> src
>
> main = do
>     print $ Data.Vector.Storable.sum
>           $ Data.Vector.Storable.map vsum
>           $ multList repCount
>           $ Data.Vector.Storable.replicate arraySize (Vec4 0 0 0 0)
>
> repCount, arraySize :: Int
> repCount = 10000
> arraySize = 20000
> }}}
>
> Timings under Linux/64bit:
> {{{
>   * fasm: 1.502s
>   * viac: 1.525s
>   * llvm: 1.853s
> }}}
>
> This isn't universal though (as in across all targets), for example we
> get the following timings under these other targets:
>
> Windows/32bit:
> {{{
>   * fasm: 3.279s
>   * viac: 3.997s
>   * llvm: 1.932s
> }}}
>
> As for if its the 32bit or the fact that its Windows isn't known yet.

New description:

 The following program is slower when compiled via the LLVM backend than
 when compiled via C or NCG.
 {{{
 {-# LANGUAGE BangPatterns #-}

 {-
     ghc 6.12.1 -O2
     1.752
 -}

 import Data.Vector.Storable
 import qualified Data.Vector.Storable as V
 import Foreign
 import Foreign.C.Types

 -- | A vector of four 'CFloat' components.
 --
 -- Every field is strict (@!@) and carries an @UNPACK@ pragma, requesting
 -- that GHC store the four floats directly in the constructor.
 data Vec4 = Vec4 {-# UNPACK #-} !CFloat
                  {-# UNPACK #-} !CFloat
                  {-# UNPACK #-} !CFloat
                  {-# UNPACK #-} !CFloat

 ------------------------------------------------------------------------

 -- | 'Storable' instance so that 'Vec4' values can be held in a storable
 -- vector. A 'Vec4' is laid out as four consecutive 'CFloat' values.
 instance Storable Vec4 where
   -- Four times the size of a single CFloat; 'undefined' only serves to
   -- select the CFloat instance.
   sizeOf _ = sizeOf (undefined :: CFloat) * 4
   -- Same alignment as a single CFloat.
   alignment _ = alignment (undefined :: CFloat)

   -- Read the four components from element offsets 0..3 of the pointer,
   -- viewed (via 'castPtr') as a pointer to the component type.
   {-# INLINE peek #-}
   peek p = do
              a <- peekElemOff q 0
              b <- peekElemOff q 1
              c <- peekElemOff q 2
              d <- peekElemOff q 3
              return (Vec4 a b c d)
     where
       q = castPtr p

   -- Write the four components to element offsets 0..3, mirroring 'peek'.
   {-# INLINE poke #-}
   poke p (Vec4 a b c d) = do
                             pokeElemOff q 0 a
                             pokeElemOff q 1 b
                             pokeElemOff q 2 c
                             pokeElemOff q 3 d
     where
       q = castPtr p

 ------------------------------------------------------------------------

 -- Constant vectors used by 'multList': each element is multiplied
 -- component-wise by 'm' and then has 'a' added to it.
 a = Vec4 0.2 0.1 0.6 1.0
 m = Vec4 0.99 0.7 0.8 0.6

 -- | Component-wise sum of two vectors.
 add :: Vec4 -> Vec4 -> Vec4
 {-# INLINE add #-}
 add (Vec4 a b c d) (Vec4 a' b' c' d') = Vec4 (a+a') (b+b') (c+c') (d+d')

 -- | Component-wise product of two vectors.
 mult :: Vec4 -> Vec4 -> Vec4
 {-# INLINE mult #-}
 mult (Vec4 a b c d) (Vec4 a' b' c' d') = Vec4 (a*a') (b*b') (c*c') (d*d')

 -- | Sum of the four components of a vector.
 vsum :: Vec4 -> CFloat
 {-# INLINE vsum #-}
 vsum (Vec4 a b c d) = a+b+c+d

 -- | Apply the update @\\v -> add (mult v m) a@ to every element of the
 -- vector, @count@ times over. A non-positive @count@ returns @src@
 -- unchanged. Both arguments are forced on entry by their bang patterns.
 multList :: Int -> Vector Vec4 -> Vector Vec4
 multList !count !src
     | count <= 0    = src
     -- The vector argument sits on an indented continuation line so that it
     -- stays part of this equation under the layout rule.
     | otherwise     = multList (count-1)
                     $ V.map (\v -> add (mult v m) a) src

 -- Benchmark driver: build a vector of 'arraySize' zero vectors, run
 -- 'multList' over it 'repCount' times, reduce each element with 'vsum',
 -- and print the sum of the results.
 main = do
     print $ Data.Vector.Storable.sum
           $ Data.Vector.Storable.map vsum
           $ multList repCount
           $ Data.Vector.Storable.replicate arraySize (Vec4 0 0 0 0)

 -- Benchmark parameters used by 'main': 'repCount' is the number of
 -- 'multList' passes, 'arraySize' the number of elements in the vector.
 repCount, arraySize :: Int
 repCount = 10000
 arraySize = 20000
 }}}

 Timings under Linux/64bit:
 {{{
   * fasm: 1.502s
   * viac: 1.525s
   * llvm: 1.853s
 }}}

 This isn't universal across all targets, though; for example, we get the
 following timings on these other targets:

 Windows/32bit:
 {{{
   * fasm: 2.178s
   * viac: 3.997s
   * llvm: 1.932s
 }}}

 Linux/32bit:
 {{{
   * fasm: 5.233s
   * viac: 10.615s
   * llvm: 5.298s
 }}}

--

-- 
Ticket URL: <http://hackage.haskell.org/trac/ghc/ticket/4223#comment:1>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler
_______________________________________________
Glasgow-haskell-bugs mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/glasgow-haskell-bugs

Re: [GHC] #4223: LLVM slower then NCG, C example

Reply via email to