Attoparsec does not have something like the Stream class, so I do not see how I could do UTF8 parsing easily.

On Jan 17, 2009, at 11:50 PM, Don Stewart wrote:

It occurs to me you could also use attoparsec, which is specifically
optimised for bytestring processing.

sjoerd:
Hi,

Somebody told me about Parsec 3, which uses a Stream type class so it
can parse any data type. This sounded like the right way to do
encoding independent parsing, so I decided to see how it would work to
parse UTF8 JSON.

Sadly I could not use Text.JSON.Parsec directly, because it uses the
old Parsec CharParser type. So I copied to code, and also replaced
p_number with the "floating" parser from Text.Parsec.Token, because
Text.JSON.Parsec uses readFloat (a dirty hack imho) which works only
on String.

If Text.JSON.Parsec was written for Parsec 3, the only thing to write
to get UTF8 JSON parsing would be:

instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8 string)
m Char where
   uncons = return . U.uncons

I did not do any performance measuring yet, I was glad I got it
working. Any comments on the code is appreciated!

greetings,
Sjoerd Visscher

{-# LANGUAGE FlexibleInstances, MultiParamTypeClasses,
UndecidableInstances #-}
import qualified Data.String.UTF8 as U
import qualified Data.ByteString as B

import Text.Parsec hiding (many, optional, (<|>))
import Control.Applicative

import Text.JSON.Types
import Control.Monad
import Data.Char
import Numeric

instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8 string)
m Char where
   uncons = return . U.uncons

type CharParser st = Parsec (U.UTF8 B.ByteString) st

parseFile :: FilePath -> IO (Either ParseError JSValue)
parseFile fileName = do
 bs <- B.readFile fileName
 return $ runParser json () fileName (U.fromRep bs)

parseString      :: String -> Either ParseError JSValue
parseString s     = runParser json () "(unknown)" (U.fromString s)

json             :: CharParser () JSValue
json              = spaces *> p_value

tok              :: CharParser () a -> CharParser () a
tok p             = p <* spaces

p_value          :: CharParser () JSValue
p_value           =  (JSNull      <$  p_null)
                <|> (JSBool      <$> p_boolean)
                <|> (JSArray     <$> p_array)
                <|> (JSString    <$> p_js_string)
                <|> (JSObject    <$> p_js_object)
                <|> (JSRational False <$> p_number)
                <?> "JSON value"

p_null           :: CharParser () ()
p_null            = tok (string "null") >> return ()

p_boolean        :: CharParser () Bool
p_boolean         = tok
                     (  (True  <$ string "true")
                    <|> (False <$ string "false")
                     )

p_array          :: CharParser () [JSValue]
p_array           = between (tok (char '[')) (tok (char ']'))
                 $ p_value `sepBy` tok (char ',')

p_string         :: CharParser () String
p_string          = between (tok (char '"')) (char '"') (many p_char)
 where p_char    =  (char '\\' >> p_esc)
                <|> (satisfy (\x -> x /= '"' && x /= '\\'))

       p_esc     =  ('"'   <$ char '"')
                <|> ('\\'  <$ char '\\')
                <|> ('/'   <$ char '/')
                <|> ('\b'  <$ char 'b')
                <|> ('\f'  <$ char 'f')
                <|> ('\n'  <$ char 'n')
                <|> ('\r'  <$ char 'r')
                <|> ('\t'  <$ char 't')
                <|> (char 'u' *> p_uni)
                <?> "escape character"

       p_uni     = check =<< count 4 (satisfy isHexDigit)
         where check x | code <= max_char  = pure (toEnum code)
                       | otherwise         = empty
                 where code      = fst $ head $ readHex x
                       max_char  = fromEnum (maxBound :: Char)

p_object         :: CharParser () [(String,JSValue)]
p_object          = between (tok (char '{')) (tok (char '}'))
                 $ p_field `sepBy` tok (char ',')
 where p_field   = (,) <$> (p_string <* tok (char ':')) <*> p_value

p_number         :: CharParser () Rational
p_number          = tok floating where

   floating       :: CharParser () Rational
   floating        = do{ n <- decimal
                       ; fract <- option 0 fraction
                       ; expo  <- option 1 exponent'
                       ; return ((fromInteger n + fract)*expo)
                       }

   fraction        = do{ char '.'
                       ; digits <- many1 digit <?> "fraction"
                       ; return (foldr op 0 digits)
                       }
                     <?> "fraction"
                   where
                     op d f    = (f + fromIntegral (digitToInt d))/10

   exponent'       = do{ oneOf "eE"
                       ; f <- sign
                       ; e <- decimal <?> "exponent"
                       ; return (power (f e))
                       }
                     <?> "exponent"
                   where
                      power e  | e < 0      = 1/power(-e)
                               | otherwise  = fromInteger (10^e)

   sign            =   (char '-' >> return negate)
                   <|> (char '+' >> return id)
                   <|> return id

   decimal         = number 10 digit

   number base baseDigit
       = do{ digits <- many1 baseDigit
           ; let n = foldl (\x d -> base*x + toInteger (digitToInt
d)) 0 digits
           ; seq n (return n)
           }


p_js_string      :: CharParser () JSString
p_js_string       = toJSString <$> p_string

p_js_object      :: CharParser () (JSObject JSValue)
p_js_object       = toJSObject <$> p_object


_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe

--
Sjoerd Visscher
sjo...@w3future.com



_______________________________________________
Haskell-Cafe mailing list
Haskell-Cafe@haskell.org
http://www.haskell.org/mailman/listinfo/haskell-cafe

Reply via email to