Repository : ssh://darcs.haskell.org//srv/darcs/packages/base

On branch  : master

http://hackage.haskell.org/trac/ghc/changeset/ae4af61a3ac82f111c234ab0f8c1fd9f14c5e467

>---------------------------------------------------------------

commit ae4af61a3ac82f111c234ab0f8c1fd9f14c5e467
Author: Max Bolingbroke <[email protected]>
Date:   Wed Nov 2 13:44:17 2011 +0000

    Avoid using iconv for the locale TextEncoding if we can help it

>---------------------------------------------------------------

 GHC/IO/Encoding.hs       |   54 +++++++++++++++++++++++++++++-----------------
 GHC/IO/Encoding/Iconv.hs |    8 +------
 2 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs
index 8d98d94..df14c00 100644
--- a/GHC/IO/Encoding.hs
+++ b/GHC/IO/Encoding.hs
@@ -35,6 +35,7 @@ import GHC.IO.Encoding.Types
 import GHC.Word
 #if !defined(mingw32_HOST_OS)
 import qualified GHC.IO.Encoding.Iconv  as Iconv
+import System.IO.Unsafe (unsafePerformIO)
 #else
 import qualified GHC.IO.Encoding.CodePage as CodePage
 import Text.Read (reads)
@@ -117,13 +118,23 @@ fileSystemEncoding :: TextEncoding
 foreignEncoding :: TextEncoding
 
 #if !defined(mingw32_HOST_OS)
-localeEncoding = Iconv.localeEncoding
-fileSystemEncoding = Iconv.mkLocaleEncoding RoundtripFailure
-foreignEncoding = Iconv.mkLocaleEncoding IgnoreCodingFailure
+-- It is rather important that we don't just call Iconv.mkIconvEncoding here
+-- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode
+-- lone surrogates without complaint.
+--
+-- By going through our Haskell implementations of those encodings, we are
+-- guaranteed to catch such errors.
+--
+-- FIXME: this is not a complete solution because if the locale encoding is one
+-- which we don't have a Haskell-side decoder for, iconv might still ignore the
+-- lone surrogate in the input.
+localeEncoding     = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure 
Iconv.localeEncodingName
+fileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure     
Iconv.localeEncodingName
+foreignEncoding    = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure  
Iconv.localeEncodingName
 #else
-localeEncoding = CodePage.localeEncoding
+localeEncoding     = CodePage.localeEncoding
 fileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
-foreignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
+foreignEncoding    = CodePage.mkLocaleEncoding IgnoreCodingFailure
 #endif
 
 -- | An encoding in which Unicode code points are translated to bytes
@@ -164,21 +175,8 @@ char8 = Latin1.latin1
 --
 mkTextEncoding :: String -> IO TextEncoding
 mkTextEncoding e = case mb_coding_failure_mode of
-  Nothing -> unknown_encoding
-  Just cfm -> case enc of
-    "UTF-8"    -> return $ UTF8.mkUTF8 cfm
-    "UTF-16"   -> return $ UTF16.mkUTF16 cfm
-    "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
-    "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
-    "UTF-32"   -> return $ UTF32.mkUTF32 cfm
-    "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
-    "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
-#if defined(mingw32_HOST_OS)
-    'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding 
cfm cp
-    _ -> unknown_encoding
-#else
-    _ -> Iconv.mkIconvEncoding cfm enc
-#endif
+    Nothing -> unknown_encoding
+    Just cfm -> mkTextEncoding' cfm enc
   where
     -- The only problem with actually documenting //IGNORE and //TRANSLIT as
     -- supported suffixes is that they are not necessarily supported with 
non-GNU iconv
@@ -193,6 +191,22 @@ mkTextEncoding e = case mb_coding_failure_mode of
     unknown_encoding = ioException (IOError Nothing NoSuchThing 
"mkTextEncoding"
                                             ("unknown encoding:" ++ e)  
Nothing Nothing)
 
+mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
+mkTextEncoding' cfm enc = case enc of
+    "UTF-8"    -> return $ UTF8.mkUTF8 cfm
+    "UTF-16"   -> return $ UTF16.mkUTF16 cfm
+    "UTF-16LE" -> return $ UTF16.mkUTF16le cfm
+    "UTF-16BE" -> return $ UTF16.mkUTF16be cfm
+    "UTF-32"   -> return $ UTF32.mkUTF32 cfm
+    "UTF-32LE" -> return $ UTF32.mkUTF32le cfm
+    "UTF-32BE" -> return $ UTF32.mkUTF32be cfm
+#if defined(mingw32_HOST_OS)
+    'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding 
cfm cp
+    _ -> unknown_encoding
+#else
+    _ -> Iconv.mkIconvEncoding cfm enc
+#endif
+
 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) 
$ Latin1.latin1_encode input output -- unchecked, used for char8
 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= 
return.encode
diff --git a/GHC/IO/Encoding/Iconv.hs b/GHC/IO/Encoding/Iconv.hs
index 2c3ad14..50cdccb 100644
--- a/GHC/IO/Encoding/Iconv.hs
+++ b/GHC/IO/Encoding/Iconv.hs
@@ -23,7 +23,7 @@
 module GHC.IO.Encoding.Iconv (
 #if !defined(mingw32_HOST_OS)
    iconvEncoding, mkIconvEncoding,
-   localeEncoding, mkLocaleEncoding
+   localeEncodingName
 #endif
  ) where
 
@@ -65,12 +65,6 @@ localeEncodingName = unsafePerformIO $ do
    cstr <- c_localeEncoding
    peekCAString cstr -- Assume charset names are ASCII
 
-localeEncoding :: TextEncoding
-localeEncoding = mkLocaleEncoding ErrorOnCodingFailure
-
-mkLocaleEncoding :: CodingFailureMode -> TextEncoding
-mkLocaleEncoding cfm = unsafePerformIO $ mkIconvEncoding cfm localeEncodingName
-
 -- We hope iconv_t is a storable type.  It should be, since it has at least the
 -- value -1, which is a possible return value from iconv_open.
 type IConv = CLong -- ToDo: (#type iconv_t)



_______________________________________________
Cvs-libraries mailing list
[email protected]
http://www.haskell.org/mailman/listinfo/cvs-libraries

Reply via email to