Repository : ssh://darcs.haskell.org//srv/darcs/packages/base On branch : master
http://hackage.haskell.org/trac/ghc/changeset/60907a3b013fb188a0c0b1dbe8aed491b4047779 >--------------------------------------------------------------- commit 60907a3b013fb188a0c0b1dbe8aed491b4047779 Author: Max Bolingbroke <[email protected]> Date: Tue Oct 11 13:15:47 2011 +0100 Improve performance of the unpack loop >--------------------------------------------------------------- GHC/IO/Encoding/Failure.hs | 15 +++++++++++++-- GHC/IO/Handle/Text.hs | 13 ++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/GHC/IO/Encoding/Failure.hs b/GHC/IO/Encoding/Failure.hs index 08e084d..4d45c46 100644 --- a/GHC/IO/Encoding/Failure.hs +++ b/GHC/IO/Encoding/Failure.hs @@ -75,43 +75,54 @@ codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP" unrepresentableChar :: Char unrepresentableChar = '\xFFFD' +-- It is extraordinarily important that this series of predicates/transformers gets inlined, because +-- they tend to be used in inner loops related to text encoding. In particular, surrogatifyRoundtripCharacter +-- must be inlined (see #5536) + -- | Some characters are actually "surrogate" codepoints defined for use in UTF-16. We need to signal an -- invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's because they won't -- give valid Unicode. -- -- We may also need to signal an invalid character if we detect them when encoding a sequence of 'Char's into 'Word8's -- because the 'RoundtripFailure' mode creates these to round-trip bytes through our internal UTF-16 encoding. +{-# INLINE isSurrogate #-} isSurrogate :: Char -> Bool isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF) where x = ord c -- | We use some private-use characters for roundtripping unknown bytes through a String +{-# INLINE isRoundtripEscapeChar #-} isRoundtripEscapeChar :: Char -> Bool isRoundtripEscapeChar c = 0xEF00 <= x && x < 0xF000 where x = ord c -- | We use some surrogate characters for roundtripping unknown bytes through a String +{-# INLINE isRoundtripEscapeSurrogateChar #-} isRoundtripEscapeSurrogateChar :: Char -> Bool isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00 where x = ord c -- Private use characters (in Strings) --> lone surrogates (in Buffer CharBufElem) +{-# INLINE surrogatifyRoundtripCharacter #-} surrogatifyRoundtripCharacter :: Char -> Char -surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 0xEF00 + 0xDC00) +surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - (0xEF00 - 0xDC00)) | otherwise = c -- Lone surrogates (in Buffer CharBufElem) --> private use characters (in Strings) +{-# INLINE desurrogatifyRoundtripCharacter #-} desurrogatifyRoundtripCharacter :: Char -> Char -desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - 0xDC00 + 0xEF00) +desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr (ord c - (0xDC00 - 0xEF00)) | otherwise = c -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem) +{-# INLINE escapeToRoundtripCharacterSurrogate #-} escapeToRoundtripCharacterSurrogate :: Word8 -> Char escapeToRoundtripCharacterSurrogate b | b < 128 = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. For roundtripping to work, this assumes encoding is ASCII-superset. | otherwise = chr (0xDC00 + fromIntegral b) -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8) +{-# INLINE unescapeRoundtripCharacterSurrogate #-} unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8 unescapeRoundtripCharacterSurrogate c | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte diff --git a/GHC/IO/Handle/Text.hs b/GHC/IO/Handle/Text.hs index 7b390cd..1f9ff11 100644 --- a/GHC/IO/Handle/Text.hs +++ b/GHC/IO/Handle/Text.hs @@ -273,6 +273,10 @@ unpack !buf !r !w acc0 unpackRB acc !i | i < r = return acc | otherwise = do + -- Here, we are rather careful to only put an *evaluated* character + -- in the output string. Due to pointer tagging, this allows the consumer + -- to avoid ping-ponging between the actual consumer code and the + -- code of desurrogatifyRoundtripCharacter #ifdef CHARBUF_UTF16 -- reverse-order decoding of UTF-16 c2 <- peekElemOff pbuf i @@ -281,10 +285,12 @@ unpack !buf !r !w acc0 else do c1 <- peekElemOff pbuf (i-1) let c = (fromIntegral c1 - 0xd800) * 0x400 + (fromIntegral c2 - 0xdc00) + 0x10000 - unpackRB (desurrogatifyRoundtripCharacter (unsafeChr c) : acc) (i-2) + c' = desurrogatifyRoundtripCharacter (unsafeChr c) + c' `seq` unpackRB (c' : acc) (i-2) #else c <- peekElemOff pbuf i - unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1) + let c' = desurrogatifyRoundtripCharacter c + c' `seq` unpackRB (c':acc) (i-1) #endif in unpackRB acc0 (w-1) @@ -307,7 +313,8 @@ unpack_nl !buf !r !w acc0 then unpackRB ('\n':acc) (i-2) else unpackRB ('\n':acc) (i-1) else do - unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1) + let c' = desurrogatifyRoundtripCharacter c + c' `seq` unpackRB (c':acc) (i-1) in do c <- peekElemOff pbuf (w-1) if (c == '\r') _______________________________________________ Cvs-libraries mailing list [email protected] http://www.haskell.org/mailman/listinfo/cvs-libraries
