Repository : ssh://darcs.haskell.org//srv/darcs/packages/base

On branch  : master

http://hackage.haskell.org/trac/ghc/changeset/60907a3b013fb188a0c0b1dbe8aed491b4047779

>---------------------------------------------------------------

commit 60907a3b013fb188a0c0b1dbe8aed491b4047779
Author: Max Bolingbroke <[email protected]>
Date:   Tue Oct 11 13:15:47 2011 +0100

    Improve performance of the unpack loop

>---------------------------------------------------------------

 GHC/IO/Encoding/Failure.hs |   15 +++++++++++++--
 GHC/IO/Handle/Text.hs      |   13 ++++++++++---
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/GHC/IO/Encoding/Failure.hs b/GHC/IO/Encoding/Failure.hs
index 08e084d..4d45c46 100644
--- a/GHC/IO/Encoding/Failure.hs
+++ b/GHC/IO/Encoding/Failure.hs
@@ -75,43 +75,54 @@ codingFailureModeSuffix RoundtripFailure           = 
"//ROUNDTRIP"
 unrepresentableChar :: Char
 unrepresentableChar = '\xFFFD'
 
+-- It is extraordinarily important that this series of predicates/transformers 
gets inlined, because
+-- they tend to be used in inner loops related to text encoding. In 
particular, surrogatifyRoundtripCharacter
+-- must be inlined (see #5536)
+
 -- | Some characters are actually "surrogate" codepoints defined for use in 
UTF-16. We need to signal an
 -- invalid character if we detect them when encoding a sequence of 'Char's 
into 'Word8's because they won't
 -- give valid Unicode.
 --
 -- We may also need to signal an invalid character if we detect them when 
encoding a sequence of 'Char's into 'Word8's
 -- because the 'RoundtripFailure' mode creates these to round-trip bytes 
through our internal UTF-16 encoding.
+{-# INLINE isSurrogate #-}
 isSurrogate :: Char -> Bool
 isSurrogate c = (0xD800 <= x && x <= 0xDBFF) || (0xDC00 <= x && x <= 0xDFFF)
   where x = ord c
 
 -- | We use some private-use characters for roundtripping unknown bytes 
through a String
+{-# INLINE isRoundtripEscapeChar #-}
 isRoundtripEscapeChar :: Char -> Bool
 isRoundtripEscapeChar c = 0xEF00 <= x && x < 0xF000
   where x = ord c
 
 -- | We use some surrogate characters for roundtripping unknown bytes through 
a String
+{-# INLINE isRoundtripEscapeSurrogateChar #-}
 isRoundtripEscapeSurrogateChar :: Char -> Bool
 isRoundtripEscapeSurrogateChar c = 0xDC00 <= x && x < 0xDD00
   where x = ord c
 
 -- Private use characters (in Strings) --> lone surrogates (in Buffer 
CharBufElem)
+{-# INLINE surrogatifyRoundtripCharacter #-}
 surrogatifyRoundtripCharacter :: Char -> Char
-surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 
0xEF00 + 0xDC00)
+surrogatifyRoundtripCharacter c | isRoundtripEscapeChar c = chr (ord c - 
(0xEF00 - 0xDC00))
                                 | otherwise               = c
 
 -- Lone surrogates (in Buffer CharBufElem) --> private use characters (in 
Strings)
+{-# INLINE desurrogatifyRoundtripCharacter #-}
 desurrogatifyRoundtripCharacter :: Char -> Char
-desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr 
(ord c - 0xDC00 + 0xEF00)
+desurrogatifyRoundtripCharacter c | isRoundtripEscapeSurrogateChar c = chr 
(ord c - (0xDC00 - 0xEF00))
                                   | otherwise                        = c
 
 -- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
+{-# INLINE escapeToRoundtripCharacterSurrogate #-}
 escapeToRoundtripCharacterSurrogate :: Word8 -> Char
 escapeToRoundtripCharacterSurrogate b
   | b < 128   = chr (fromIntegral b) -- Disallow 'smuggling' of ASCII bytes. 
For roundtripping to work, this assumes encoding is ASCII-superset.
   | otherwise = chr (0xDC00 + fromIntegral b)
 
 -- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
+{-# INLINE unescapeRoundtripCharacterSurrogate #-}
 unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
 unescapeRoundtripCharacterSurrogate c
     | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
diff --git a/GHC/IO/Handle/Text.hs b/GHC/IO/Handle/Text.hs
index 7b390cd..1f9ff11 100644
--- a/GHC/IO/Handle/Text.hs
+++ b/GHC/IO/Handle/Text.hs
@@ -273,6 +273,10 @@ unpack !buf !r !w acc0
         unpackRB acc !i
          | i < r  = return acc
          | otherwise = do
+              -- Here, we are rather careful to only put an *evaluated* 
character
+              -- in the output string. Due to pointer tagging, this allows the 
consumer
+              -- to avoid ping-ponging between the actual consumer code and the
+              -- code of desurrogatifyRoundtripCharacter
 #ifdef CHARBUF_UTF16
               -- reverse-order decoding of UTF-16
               c2 <- peekElemOff pbuf i
@@ -281,10 +285,12 @@ unpack !buf !r !w acc0
                  else do c1 <- peekElemOff pbuf (i-1)
                          let c = (fromIntegral c1 - 0xd800) * 0x400 +
                                  (fromIntegral c2 - 0xdc00) + 0x10000
-                         unpackRB (desurrogatifyRoundtripCharacter (unsafeChr 
c) : acc) (i-2)
+                             c' = desurrogatifyRoundtripCharacter (unsafeChr c)
+                         c' `seq` unpackRB (c' : acc) (i-2)
 #else
               c <- peekElemOff pbuf i
-              unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
+              let c' = desurrogatifyRoundtripCharacter c
+              c' `seq` unpackRB (c':acc) (i-1)
 #endif
      in
      unpackRB acc0 (w-1)
@@ -307,7 +313,8 @@ unpack_nl !buf !r !w acc0
                             then unpackRB ('\n':acc) (i-2)
                             else unpackRB ('\n':acc) (i-1)
                  else do
-                         unpackRB (desurrogatifyRoundtripCharacter c:acc) (i-1)
+                         let c' = desurrogatifyRoundtripCharacter c
+                         c' `seq` unpackRB (c':acc) (i-1)
      in do
      c <- peekElemOff pbuf (w-1)
      if (c == '\r')



_______________________________________________
Cvs-libraries mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/cvs-libraries

Reply via email to