Don Stewart wrote:
Here's an example that doesn't use floating point:

import Data.Array.Vector
import Data.Bits

main = print . sumU $ zipWith3U (\x y z -> x * y * z)
                        (enumFromToU 1 (100000000 :: Int))
                        (enumFromToU 2 (100000001 :: Int))
                        (enumFromToU 7 (100000008 :: Int))

In core:

main_$s$wfold :: Int# -> Int# -> Int# -> Int# -> Int#
main_$s$wfold =
  \ (sc_s1l1 :: Int#)
    (sc1_s1l2 :: Int#)
    (sc2_s1l3 :: Int#)
    (sc3_s1l4 :: Int#) ->
    case ># sc2_s1l3 100000000 of _ {
      False ->
        case ># sc1_s1l2 100000001 of _ {
          False ->
            case ># sc_s1l1 100000008 of _ {
              False ->
                main_$s$wfold
                  (+# sc_s1l1 1)
                  (+# sc1_s1l2 1)
                  (+# sc2_s1l3 1)
                  (+# sc3_s1l4 (*# (*# sc2_s1l3 sc1_s1l2) sc_s1l1));
              True -> sc3_s1l4
            };
          True -> sc3_s1l4
        };
      True -> sc3_s1l4
    }

Rather nice!
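
(For reference, the fused Core above corresponds to a hand-written counting loop roughly like the following. This is only an illustrative sketch, not part of the original program; the names are made up:)

{-# LANGUAGE BangPatterns #-}

-- Three counters plus an accumulator; with -O2, strictness analysis
-- and worker/wrapper unbox all four Ints, giving the Int# loop in the Core.
loop :: Int -> Int -> Int -> Int -> Int
loop !x !y !z !acc
  | x > 100000000 = acc
  | y > 100000001 = acc
  | z > 100000008 = acc
  | otherwise     = loop (x+1) (y+1) (z+1) (acc + x*y*z)

-- started as: loop 1 2 7 0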

With -fvia-C -optc-O3:

Main_mainzuzdszdwfold_info:
cmpq $100000000, %rdi
jg .L6
cmpq $100000001, %rsi
jg .L6
cmpq $100000008, %r14
jle .L10
.L6:
movq %r8, %rbx
movq (%rbp), %rax
jmp *%rax
.L10:
movq %rsi, %r10
leaq 1(%rsi), %rsi
imulq %rdi, %r10
leaq 1(%rdi), %rdi
imulq %r14, %r10
leaq 1(%r14), %r14
leaq (%r10,%r8), %r8
jmp Main_mainzuzdszdwfold_info

Which looks ok.

$ time ./zipwith3
3541230156834269568
./zipwith3 0.33s user 0.00s system 99% cpu 0.337 total

And with -fasm we get very different code, and a bit of a slowdown:

Main_mainzuzdszdwfold_info:
.Lc1mo:
cmpq $100000000,%rdi
jg .Lc1mq
cmpq $100000001,%rsi
jg .Lc1ms
cmpq $100000008,%r14
jg .Lc1mv

movq %rsi,%rax
imulq %r14,%rax
movq %rdi,%rcx
imulq %rax,%rcx
movq %r8,%rax
addq %rcx,%rax
leaq 1(%rdi),%rcx
leaq 1(%rsi),%rdx
incq %r14
movq %rdx,%rsi
movq %rcx,%rdi
movq %rax,%r8
jmp Main_mainzuzdszdwfold_info

.Lc1mq:
movq %r8,%rbx
jmp *(%rbp)
.Lc1ms:
movq %r8,%rbx
jmp *(%rbp)
.Lc1mv:
movq %r8,%rbx
jmp *(%rbp)

Slower:

$ time ./zipwith3
3541230156834269568
./zipwith3 0.38s user 0.00s system 98% cpu 0.384 total

Now maybe we need to wait for the new back-end optimisations to get there?

I've been looking at some of these cases to see how the LLVM back-end performs. My general impression from benchmarking the LLVM back-end in the past is that it has similar performance characteristics to the C code generator (that is, where the C code generator stood out compared to the NCG, so did LLVM).
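
For anyone wanting to reproduce this, the three variants of each program were built roughly along these lines (the exact invocations are approximate; -optc-O3 as in Don's via-C run above):

$ ghc -O2 --make -fvia-C -optc-O3 ZipWith3.hs -o zipWith3_viac
$ ghc -O2 --make -fasm ZipWith3.hs -o zipWith3_fasm
$ ghc -O2 --make -fllvm ZipWith3.hs -o zipWith3_llvm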

(On x86-32/Mac OS X 10.5; I've had some issues getting x64 working at the moment):

./zipWith3_viac 0.72s
./zipWith3_fasm 0.65s
./zipWith3_llvm 0.38s

Code that LLVM produces:

_Main_mainzuzdszdwfold_entry:
## BB#0: ## %c1qP
subl $12, %esp
jmp LBB2_1
.align 4, 0x90
LBB2_4: ## %n1re
## Loop Depth 1
## Loop Header is BB2_1
## Inner Loop
movl %ecx, %esi
incl %ecx
imull %eax, %esi
incl %eax
imull %edx, %esi
incl %edx
addl (%ebp), %esi
movl %edx, 12(%ebp)
movl %ecx, 8(%ebp)
movl %eax, 4(%ebp)
movl %esi, (%ebp)
LBB2_1: ## %tailrecurse
## Loop Depth 1
## Loop Header
## Inner Loop
movl 4(%ebp), %eax
cmpl $100000000, %eax
jg LBB2_5
## BB#2: ## %n1qX
## Loop Depth 1
## Loop Header is BB2_1
## Inner Loop
movl 8(%ebp), %ecx
cmpl $100000001, %ecx
jg LBB2_5
## BB#3: ## %n1r5
## Loop Depth 1
## Loop Header is BB2_1
## Inner Loop
movl 12(%ebp), %edx
cmpl $100000008, %edx
jle LBB2_4
LBB2_5: ## %c1qW
movl 16(%ebp), %eax
movl (%ebp), %esi
addl $16, %ebp
movl (%eax), %eax
addl $12, %esp
jmpl *%eax # TAILCALL

Which is very nice. (The comments in the code are inserted by LLVM, not me).

I also ran through some of the programs outlined here:

http://permalink.gmane.org/gmane.comp.lang.haskell.glasgow.user/18151

All were run with: echo '1e-8' | ./$PRG
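
(For context: as I read the code, each of the three programs below approximates pi with the Leibniz series, pi/4 = 1 - 1/3 + 1/5 - 1/7 + ..., summing terms until they drop to roughly the size of eps. With eps = 1e-8 that is a couple of hundred million iterations of a tight Double loop, which is what the benchmarks exercise.)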

Loop.hs:
========================================
{-# LANGUAGE BangPatterns #-}
module Main (main) where

main :: IO ()
main = do
putStrLn "EPS: "
eps <- readLn :: IO Double
let !mx = (4/eps)
!pi14 = pisum mx
putStrLn $ "PI mit EPS "++(show eps)++" = "++ show(4*pi14)

pisum :: Double -> Double
pisum cut = go True 1 0
  where
    go b n s | cut < n = if b then s+1/(2*n) else s-1/(2*n)
    go True  n !s = go False (n+2) (s+recip n)
    go False n !s = go True  (n+2) (s-recip n)
========================================

./Loops_fasm 4.53s
./Loops_viac 4.22s
./Loops_llvm 2.89s


Fusion.hs (uses stream-fusion package)
========================================
{-# LANGUAGE BangPatterns #-}
module Main (main) where

import qualified Data.List.Stream as S

main :: IO ()
main = do
putStrLn "EPS: "
eps <- readLn :: IO Double
let !mx = floor (4/eps)
!k = (mx+1) `quot` 2
putStrLn $ "PI mit EPS " ++ (show eps) ++ " = " ++ show (leibniz k)

leibniz n = (4 *) $ S.sum $ S.take n step

step :: [Double]
step = S.unfoldr phi (True,1) where
    phi (sig,d) | sig       = Just (1/d, (False,d+2))
                | otherwise = Just (negate (1/d), (True,d+2))
========================================

./Fusion_fasm 4.61s
./Fusion_viac 4.22s
./Fusion_llvm 3.62s


List.hs
========================================
{-# LANGUAGE BangPatterns #-}
module Main (main) where

import Data.List (unfoldr)

main :: IO ()
main = do
putStrLn "EPS: "
eps <- readLn :: IO Double
let mx = floor (4/eps)
!k = (mx+1) `quot` 2
putStrLn $ "PI mit EPS " ++ (show eps) ++ " = " ++ show (leibniz k)

leibniz n = (4 *) $ sum $ take n step

step :: [Double]
step = unfoldr phi (True,1) where
    phi (sig,d) | sig       = Just (1/d, (False,d+2))
                | otherwise = Just (negate (1/d), (True,d+2))
========================================

./List_fasm 18.21s
./List_viac 16.71s
./List_llvm 16.92s

So with these kinds of results (though obviously I'm biased, since I wrote the LLVM back-end), I think the sentiment that -fvia-C should eventually be removed is the right way to go: with the LLVM back-end and the new code generator there is a promising and much more interesting future.

~ David
