Hello all,

So I'm processing a large XML file which is a database of about 170k
entries, each of which is a reasonable enough size on its own, and I only
need streaming access to the database (basically printing out summary data
for each entry). Excellent, sounds like a job for SAX.

However, after whipping up a simplified version of the program using
hexpat, there's a space leak. Near as I can tell, it's not a problem with
my code, it's a problem with Data.List.Class (or hexpat's use thereof).
The simplified code follows, just compile it for profiling and use hp2ps
to see what I mean. The file I'm running it on can be found at:


Any ideas on what the problem really is, or how to fix it?

module JMdict (main) where

import           Text.XML.Expat.SAX   (SAXEvent(..))
import qualified Text.XML.Expat.SAX   as SAX
import           Text.XML.Expat.Tree  (NodeG(..))
import qualified Text.XML.Expat.Tree  as DOM
import qualified Text.XML.Expat.Proc  as Proc
import qualified Text.XML.Expat.Internal.NodeClass as Node
import qualified Data.ByteString.Lazy as BL
import           Data.Char            (isSpace)
import           Data.Text.IO         as TIO
import qualified Data.Text            as T
import           Control.Applicative  ((<$>))
import           Control.Monad        (forM_)
import qualified System.IO            as IO
import qualified System.Environment   as Sys (getArgs)
import qualified System.Exit          as Sys (exitFailure)
import qualified System.Directory     as Sys
    (doesFileExist, getPermissions, readable)

-- | A variant of 'Control.Monad.unless' for when the boolean is
-- also monadic.
unlessM :: Monad m => m Bool -> m () -> m ()
unlessM mb handle = do
    b <- mb
    if b then return () else handle

-- | If the file does not exist or is not readable, then crash the
-- program.
assertFileExistsReadable :: FilePath -> IO ()
assertFileExistsReadable file = do
    unlessM (Sys.doesFileExist file) $ do
        IO.hPutStrLn IO.stderr $ "No such file: "++file
    unlessM (Sys.readable <$> Sys.getPermissions file) $ do
        IO.hPutStrLn IO.stderr $ "File not readable: "++file

main :: IO ()
main = do
    files <- Sys.getArgs
    forM_ files $ \file -> do
        assertFileExistsReadable file
        countElements 0
            . filter (not . isWhitespace)
            . dropPreamble
            . SAX.parse SAX.defaultParseOptions
            =<< BL.readFile file
    dropPreamble (StartElement t _ : xs) | t == T.pack "JMdict" = xs
    dropPreamble (_:xs) = dropPreamble xs
    dropPreamble []     = []

    isWhitespace (CharacterData c) | T.all isSpace c = True
    isWhitespace _                                   = False

    countElements :: Int -> [SAXEvent T.Text T.Text] -> IO ()
    countElements n [] = print n
    countElements n xs =
        case anyElement xs of
        (Left  err, xs') -> fail $ show err ++": "++ show (take 3 xs')
        (Right ell, xs') -> do
            print (n+1)
            (countElements $! n+1) xs'

data ElementError
    = EmptyStream
    | NoStartElement
    | EndOfStream
    | InvalidXML
    deriving (Read, Show, Eq)

-- | Split an event stream into an initial element and the remainder
-- of the stream. Use 'DOM.saxToTree' to convert the element to a
-- tree.
    :: (Eq tag)
    => [SAXEvent tag text]
    -> (Either ElementError [SAXEvent tag text], [SAXEvent tag text])
anyElement = start
    start []         = (Left EmptyStream, [])
    start xxs@(x:xs) =
        case x of
        StartElement t _ -> loop [t] (x:) xs
        _                -> (Left NoStartElement, xxs)

    loop _          _ []         = (Left EndOfStream, [])
    loop []         k xs         = (Right (k []), xs)
    loop tts@(t:ts) k xxs@(x:xs) = step (k . (x:)) xs
        step =
            case x of
            StartElement t' _ -> loop (t':tts)
            EndElement   t'
                | t' == t     -> loop ts
                | otherwise   -> \_ _ -> (Left InvalidXML, xxs)
            _                 -> loop tts

----------------------------------------------------------- fin.

Live well,

Haskell-Cafe mailing list

Reply via email to