[Haskell-cafe] Fast mutable arrays of ByteString?

Maxime Henrion Wed, 15 Apr 2009 08:40:59 -0700

        Hello all,


I have been rewriting a small utility program of mine from C to Haskell
for fun.  This tool reads lines from stdin or from files, shuffles them
one or more times using the Fisher-Yates algorithm, and outputs the
result to stdout.

Since this algorithm is based on in-place updates, I've been storing my
strings in a mutable array in the ST monad.  Since it's holding strings
I could not use an unboxed array.  The resulting program works fine and
seems to run at a decent speed, even though it is much slower than the
original C version, slightly more so than I expected.

While trying to optimize it using profiling, and playing with the number
of shuffling passes, I noticed that this operation was responsible for a
significant amount of the runtime, much more so than with the C version.
I also noticed that the %GC time was around 56%.

In order to do more tests, I wrote another version of this program which
keeps the strings in a pure and immutable array, and stores the indices
of this array in an unboxed mutable ST array.  The shuffling is then
done on this indices array instead of the strings array.

This version runs much faster and only spends ~21% of its time in the
garbage collector, at the cost of consuming more memory for the indices
array.

I'm attaching both versions of the code to this e-mail, and I'd be
curious to hear about any possible improvements to it, and whether the
performance of STArray of ByteString I'm observing corresponds to
people's expectations.

Thanks in advance,
Maxime Henrion

{-# LANGUAGE BangPatterns #-}
module Main where

import Control.Monad
import Control.Monad.ST
 
import Data.Array.ST
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as S
import Data.Maybe

import System.Console.GetOpt
import System.Environment
import System.IO
import System.Random

data Config = Config
  { separator :: Char
  , numPasses :: Int
  } deriving (Eq, Show)

defaultConfig = Config
  { separator = '\n'
  , numPasses = 1
  }
                
options :: [OptDescr (Config -> Config)]
options =
  [ Option ['0'] [] (NoArg (\cfg -> cfg { separator = '\0' }))
      "expect ASCII NUL characters as separators"
  , Option ['d'] [] (OptArg ((\d cfg -> cfg { separator = d }) . maybe '\n' head) "delim")
      "use provided character as separator"
  , Option ['n'] [] (OptArg ((\n cfg -> cfg { numPasses = n }) . readNumPasses) "npass")
      "run # passes instead of just 1"
  ]
 where readNumPasses = maybe 1 (headMaybe 1 fst . reads)
       headMaybe _ f [x] = f x
       headMaybe d _ _   = d

main :: IO ()
main = do argv <- getArgs
          case getOpt RequireOrder options argv of
            (opts, args, []) -> do g     <- newStdGen
                                   input <- getInput args
                                   let Config { separator = sep
                                              , numPasses = n
                                              } = foldl (flip id) defaultConfig opts
                                   forM_ (shuffle g n (slice sep input)) $ \ln ->
                                     S.putStr ln >> putChar sep
            (_,  _, (err:_)) -> do prog <- getProgName
                                   hPutStr stderr (prog ++ ": " ++ err)

getInput :: [FilePath] -> IO ByteString
getInput [] = S.getContents
getInput ps = S.concat `fmap` mapM S.readFile ps

slice :: Char -> ByteString -> (Int, [ByteString])
slice d s = slice' d s 1 []
  where
    slice' d s !count lns =
      case S.elemIndex d s of
        Nothing                -> (count, s:lns)
        Just n
         | n == S.length s - 1 -> (count, S.take n s:lns) -- ignore empty trailing lines
         | otherwise           -> slice' d (S.drop (n + 1) s) (count + 1) (S.take n s:lns)
{-# INLINE slice #-}

shuffle :: RandomGen g => g -> Int -> (Int, [ByteString]) -> [ByteString]
shuffle g n (count,lns) =
  runST $ do
    arr <- newListArray (0,count - 1) lns :: ST s (STArray s Int ByteString)
    forM_ swaps $ \(i,j) ->
      when (i /= j) $ do
        tmp <- readArray arr i
        readArray arr j >>= writeArray arr i
        writeArray arr j tmp 
    getElems arr
  where swaps = zipWith (\n k -> (n - 1, k `mod` n))
                  (concat $ replicate n [count,count-1..2]) (randoms g)

{-# LANGUAGE BangPatterns #-}
module Main where

import Control.Monad
import Control.Monad.ST
 
import Data.Array
import Data.Array.ST
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as S
import Data.Maybe

import System.Console.GetOpt
import System.Environment
import System.IO
import System.Random

data Config = Config
  { separator :: Char
  , numPasses :: Int
  } deriving (Eq, Show)

defaultConfig = Config
  { separator = '\n'
  , numPasses = 1
  }
                
options :: [OptDescr (Config -> Config)]
options =
  [ Option ['0'] [] (NoArg (\cfg -> cfg { separator = '\0' }))
      "expect ASCII NUL characters as separators"
  , Option ['d'] [] (OptArg ((\d cfg -> cfg { separator = d }) . maybe '\n' head) "delim")
      "use provided character as separator"
  , Option ['n'] [] (OptArg ((\n cfg -> cfg { numPasses = n }) . readNumPasses) "npass")
      "run # passes instead of just 1"
  ]
 where readNumPasses = maybe 1 (headMaybe 1 fst . reads)
       headMaybe _ f [x] = f x
       headMaybe d _ _   = d

main :: IO ()
main = do argv <- getArgs
          case getOpt RequireOrder options argv of
            (opts, args, []) -> do g     <- newStdGen
                                   input <- getInput args
                                   let Config { separator = sep
                                              , numPasses = n
                                              } = foldl (flip id) defaultConfig opts
                                   forM_ (shuffle g n (slice sep input)) $ \ln ->
                                     S.putStr ln >> putChar sep
            (_,  _, (err:_)) -> do prog <- getProgName
                                   hPutStr stderr (prog ++ ": " ++ err)

getInput :: [FilePath] -> IO ByteString
getInput [] = S.getContents
getInput ps = S.concat `fmap` mapM S.readFile ps

slice :: Char -> ByteString -> Array Int ByteString
slice d s = slice' d s 0 []
  where
    slice' d s !count lns =
      case S.elemIndex d s of
        Nothing                -> array (0, count) ((count, s):lns)
        Just n
         | n == S.length s - 1 -> array (0, count) ((count, S.take n s):lns) -- ignore empty trailing lines
         | otherwise           -> slice' d (S.drop (n+1) s) (count+1) ((count, S.take n s):lns)
{-# INLINE slice #-}

shuffle :: RandomGen g => g -> Int -> Array Int ByteString -> [ByteString]
shuffle g n strings =
  runST $ do
    let bnds = bounds strings
        size = snd bnds + 1
    idxs <- newListArray bnds [0..] :: ST s (STUArray s Int Int)
    forM_ (swaps size) $ \(i,j) ->
      when (i /= j) $ do
        tmp <- readArray idxs i
        readArray idxs j >>= writeArray idxs i
        writeArray idxs j tmp 
    map (strings !) `fmap` getElems idxs
  where swaps count = zipWith (\n k -> (n - 1, k `mod` n))
                       (concat $ replicate n [count,count-1..2]) (randoms g)

_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe

[Haskell-cafe] Fast mutable arrays of ByteString?

Reply via email to