Hello,

over the weekend I played a little with the fasta benchmark at the shootout site. I tried to understand the current submission and decided to build my own version. Astonishingly, after some tuning its quite fast (<1sec on my machine, when the current
entry use ~11sec) but uses quite some memory (~11MB).
Basically you have to use a custom pseudo random number generator to build
strings of randomized DNA by choosing from a given probability distribution.
Is there some way to reduce memory consumption? 11MB seems quite a lot.

Thorsten


{-# LANGUAGE BangPatterns #-}
module Main where

import System.Environment

import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C (pack, take)
import qualified Data.ByteString as S

data P = P !Char !Float
type LUT = [P]

alu =  C.pack "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
              \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
              \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
              \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
              \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
              \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
              \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"

iubs, homs :: LUT
iubs = cdf [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02)
           ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02)
           ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)]

homs = cdf [('a',0.3029549426680),('c',0.1979883004921)
           ,('g',0.1975473066391),('t',0.3015094502008)]

-- compile LUT from assoc list
cdf :: [(Char,Float)] -> LUT
cdf ls = reverse $! cdf' [] 0 ls
    where cdf' acc _ [] = acc
          cdf' acc c ((v,k):ls) = cdf' ((P v c'):acc) c' ls
              where !c'  = k + c

-- extract Char from List by Key
choose :: LUT -> Float -> Char
choose lut !f = choose' lut
    where choose' ((P v k):ls)| f <= k = v
                              | otherwise = choose' ls

-- PRNG
im, ia, ic :: Int
im  = 139968
ia  = 3877
ic  = 29573

data R = R !Float !Int

imd :: Float
imd = fromIntegral im

rand :: Int -> R
rand seed = R newran newseed
    where
        newseed = (seed * ia + ic) `rem` im
        newran  = (fromIntegral newseed) / imd
-- /PRNG

-- Cache all possible results
cache :: LUT -> L.ByteString
cache ls = C.pack $! reverse $! go [v] s
where (R !f !s) = rand 42 -- if we arrive at Seed 42, we completed one cycle, STOP
          !v = choose ls f
          go :: String -> Int -> String
          go acc 42 = acc
          go acc !q  = go (v':acc) s'
              where (R !f' !s') = rand q
                    !v' = choose ls f'

-- FASTA writer from current entry
fasta n s = do
    let (t:ts) = L.toChunks s
    go ts t n
  where
     go ss s n
        | n == 0     = return ()
        | l60 && n60 = S.putStrLn l >> go ss        r (n-60)
        |        n60 = S.putStr s >> S.putStrLn a >> go (tail ss) b (n-60)
        | n <= ln    = S.putStrLn (S.take n s)
        | otherwise  = S.putStr s >> S.putStrLn (S.take (n-ln) (head ss))
        where
            ln   = S.length s
            l60  = ln >= 60
            n60  = n >= 60
            (l,r) = S.splitAt 60 s
            (a,b) = S.splitAt (60-ln) (head ss)

main = do n <- getArgs >>= readIO . head
          putStrLn  ">ONE Homo sapiens alu"
          fasta (n*2) $! L.cycle alu
          putStrLn ">TWO IUB ambiguity codes"
          fasta (n*3)  (L.cycle $! cache $! iubs)
          putStrLn ">THREE Homo sapiens frequency"
fasta (n*5) (L.drop (fromIntegral (n*3) `mod` 139968) $! L.cycle $! cache homs)


_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe

Reply via email to