[pypy-commit] pypy unicode-utf8-py3: win32 fixes

mattip Sun, 16 Sep 2018 22:46:57 -0700

Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95135:8a769610ff91
Date: 2018-09-17 08:02 +0300
http://bitbucket.org/pypy/pypy/changeset/8a769610ff91/


Log:    win32 fixes

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -6,3 +6,7 @@
 * make sure we review all the places that call ord(unichr) to check for ValueErrors
 * rewrite unicodeobject.unicode_to_decimal_w to only use utf8 encoded bytes
 * revisit why runicode import str_decode_utf_8_impl needed instead of runicode import str_decode_utf_8
+* revisit all places where we do utf8.decode('utf-8'), they should work directly with utf8
+  - rutf8.utf8_encode_mbcs
+  - unicodehelper.fsencode
+  - interp_posix.FileEncoder.as_unicode (used in win32)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -77,16 +77,15 @@
 def fsdecode(space, w_string):
     from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
+    errorhandler=state.decode_error_handler,
     if _WIN32:
         bytes = space.bytes_w(w_string)
-        uni = str_decode_mbcs(bytes, 'strict',
-                              errorhandler=decode_error_handler(space),
+        uni = str_decode_mbcs(bytes, 'strict', True, errorhandler,
                               force_ignore=False)[0]
     elif _MACOSX:
         bytes = space.bytes_w(w_string)
         uni = str_decode_utf8(
             bytes, 'surrogateescape', final=True,
-            errorhandler=state.decode_error_handler,
             allow_surrogates=False)[0]
     elif space.sys.filesystemencoding is None or state.codec_need_encodings:
         # bootstrap check: if the filesystemencoding isn't initialized
@@ -109,15 +108,14 @@
     from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     if _WIN32:
-        uni = space.utf8_w(w_uni)
-        bytes = unicode_encode_mbcs(uni, len(uni), 'strict',
-                                    errorhandler=encode_error_handler(space),
-                                    force_replace=False)
+        errorhandler=state.encode_error_handler,
+        utf8 = space.utf8_w(w_uni)
+        bytes = utf8_encode_mbcs(utf8, 'strict', errorhandler)
     elif _MACOSX:
-        uni = space.utf8_w(w_uni)
+        utf8 = space.utf8_w(w_uni)
+        errorhandler=state.encode_error_handler,
         bytes = unicodehelper.utf8_encode_utf_8(
-            uni, 'surrogateescape',
-            errorhandler=state.encode_error_handler,
+            utf8, 'surrogateescape',
             allow_surrogates=False)
     elif space.sys.filesystemencoding is None or state.codec_need_encodings:
         # bootstrap check: if the filesystemencoding isn't initialized
@@ -314,16 +312,12 @@
 
 if _WIN32:
     def utf8_encode_mbcs(s, errors, errorhandler):
-        s = s.decode('utf-8')
-        if errorhandler is None:
-            errorhandler = encode_error_handler(space)
-        res = unicode_encode_mbcs(s, slen, errors, errorhandler)
+        res = rutf8.utf8_encode_mbcs(s, errors, errorhandler,
+                                     force_replace=False)
         return res
         
     def str_decode_mbcs(s, errors, final, errorhandler, force_ignore=True):
         slen = len(s)
-        if errorhandler is None:
-            errorhandler = decode_error_handler(space) 
         res, size = runicode.str_decode_mbcs(s, slen, errors, final=final,
                           errorhandler=errorhandler, force_ignore=force_ignore)
         res_utf8 = runicode.unicode_encode_utf_8(res, len(res), 'strict')
diff --git a/pypy/module/posix/interp_posix.py b/pypy/module/posix/interp_posix.py
--- a/pypy/module/posix/interp_posix.py
+++ b/pypy/module/posix/interp_posix.py
@@ -56,7 +56,7 @@
         return self.space.fsencode_w(self.w_obj)
 
     def as_unicode(self):
-        return self.space.unicode0_w(self.w_obj)
+        return self.space.utf8_w(self.w_obj).decode('utf8')
 
 class FileDecoder(object):
     is_unicode = False
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -810,3 +810,63 @@
             res.append_slice(s, start, end)
             i = end
     return res.build()
+
+# ____________________________________________________________
+# MBCS codecs for Windows
+
+if sys.platform == 'win32':
+    from rpython.rtyper.lltypesystem import lltype, rffi
+    from rpython.rlib.runicode import CP_ACP, BOOLP, WideCharToMultiByte
+    from rpython.rlib import rwin32
+
+    def utf8_encode_mbcs(s, errors, errorhandler,
+                            force_replace=True):
+        # TODO: do the encoding without decoding utf8 -> unicode
+        uni = s.decode('utf8')
+        lgt = len(uni)
+        if not force_replace and errors not in ('strict', 'replace'):
+            msg = "mbcs encoding does not support errors='%s'" % errors
+            errorhandler('strict', 'mbcs', msg, s, 0, 0)
+
+        if lgt == 0:
+            return ''
+
+        if force_replace or errors == 'replace':
+            flags = 0
+            used_default_p = lltype.nullptr(BOOLP.TO)
+        else:
+            # strict
+            flags = rwin32.WC_NO_BEST_FIT_CHARS
+            used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
+            used_default_p[0] = rffi.cast(rwin32.BOOL, False)
+
+        try:
+            with rffi.scoped_nonmoving_unicodebuffer(uni) as dataptr:
+                # first get the size of the result
+                mbcssize = WideCharToMultiByte(CP_ACP, flags,
+                                               dataptr, lgt, None, 0,
+                                               None, used_default_p)
+                if mbcssize == 0:
+                    raise rwin32.lastSavedWindowsError()
+                # If we used a default char, then we failed!
+                if (used_default_p and
+                    rffi.cast(lltype.Bool, used_default_p[0])):
+                    errorhandler('strict', 'mbcs', "invalid character",
+                                 s, 0, 0)
+
+                with rffi.scoped_alloc_buffer(mbcssize) as buf:
+                    # do the conversion
+                    if WideCharToMultiByte(CP_ACP, flags,
+                                           dataptr, lgt, buf.raw, mbcssize,
+                                           None, used_default_p) == 0:
+                        raise rwin32.lastSavedWindowsError()
+                    if (used_default_p and
+                        rffi.cast(lltype.Bool, used_default_p[0])):
+                        errorhandler('strict', 'mbcs', "invalid character",
+                                     s, 0, 0)
+                    result = buf.str(mbcssize)
+                    assert result is not None
+                    return result
+        finally:
+            if used_default_p:
+                lltype.free(used_default_p, flavor='raw')
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: win32 fixes

Reply via email to