Dear Haskellers, Another story from an (almost) happy Haskell user that finds himself overwhelmed by laziness/space leaks.
I'm trying to parse a large file (>600MB) with a single S-expression like structure. With the help of ByteStrings I'm down to 4min processing time in constant space. However, when I try to wrap the parse results in a data structure, the heap blows up - even though I never actually inspect the structure being built! This bugs me, so I come here looking for answers. Parser follows: > module Main where > > import qualified Data.ByteString.Lazy.Char8 as B > import Text.ParserCombinators.Parsec > import Text.ParserCombinators.Parsec.Pos > import System.Environment > import System.Exit > import qualified Data.Map as M > import Lexer > > type XdlParser a = GenParser Token XdlState a > > -- Parser state > type XdlState = Counts > > type Counts = M.Map Count Integer > > data Count = ListCount | SymbolCount > deriving (Eq, Ord, Show) > > emptyXdlState = M.empty > > incCount :: Count -> (Counts -> Counts) > incCount c = M.insertWith' (+) c 1 > > -- handling tokens > myToken :: (Token -> Maybe a) -> XdlParser a > myToken test = token showTok posTok testTok > where > showTok = show > posTok = const (initialPos "") > testTok = test > > -- Syntax of expressions > data Exp = Sym !B.ByteString | List ![Exp] > deriving (Eq, Show) > > expr = list <|> symbol > > rparen = myToken $ \t -> case t of > RParen -> Just () > other -> Nothing > > lparen = myToken $ \t -> case t of > LParen -> Just () > other -> Nothing > > name = myToken $ \t -> case t of > Name n -> Just n > other -> Nothing > > list = do > updateState $ incCount ListCount > lparen > xs <- many1 expr > rparen > return () > -- return $! (List xs) > > symbol = do > updateState $ incCount SymbolCount > name >> return () > -- Sym `fmap` name > > -- Top level parser > top :: XdlParser XdlState > top = do > l <- many1 list > eof > getState > > main = do > args <- getArgs > case args of > [fname] -> do > text <- B.readFile fname > let result = runParser top emptyXdlState fname (tokenize text) > putStrLn $ either show show result > _ -> putStrLn "usage: parse filename" >> exitFailure And the Lexer: > module Lexer (Token(..), tokenize) where > > import qualified Data.ByteString.Lazy.Char8 as B > import Control.Monad > import Data.Char > import Data.List > import System.Environment > import System.Exit > > data Token = LParen > | RParen > | Name B.ByteString > deriving (Ord, Eq, Show) > > type Input = B.ByteString > > -- Processor returns Nothing if it can't process the Input > type Processor = Input -> Maybe ([Token], Input) > > -- Tokenize ends the list when all processors return Nothing > tokenize :: Input -> [Token] > tokenize = concat . unfoldr processAll > where > processors = [doSpaces, doComment, doParens, doName] > processAll :: Processor > processAll bs = if B.null bs > then Nothing > else foldr mminus Nothing $ map ($ bs) processors > mminus a@(Just _) _ = a > mminus Nothing b = b > > doSpaces :: Processor > doSpaces bs = > if B.null sp > then Nothing > else Just ([], nsp) > where > (sp, nsp) = B.span isSpace bs > > doComment :: Processor > doComment bs = > if B.pack "# " `B.isPrefixOf` bs > then Just ([], B.dropWhile (/= '\n') bs) > else Nothing > > doParens :: Processor > doParens bs = case B.head bs of > '(' -> Just ([LParen], B.tail bs) > ')' -> Just ([RParen], B.tail bs) > _ -> Nothing > > doName :: Processor > doName bs = > if B.null nsp > then Nothing > else Just ([Name nsp], sp) > where > (nsp, sp) = B.span (not . isRest) bs > isRest c = isSpace c || c == ')' || c == '(' Regards, -- Krzysztof Kościuszkiewicz Skype: dr.vee, Gadu: 111851, Jabber: [EMAIL PROTECTED] "Simplicity is the ultimate sophistication" -- Leonardo da Vinci _______________________________________________ Haskell-Cafe mailing list Haskell-Cafe@haskell.org http://www.haskell.org/mailman/listinfo/haskell-cafe