Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95135:8a769610ff91
Date: 2018-09-17 08:02 +0300
http://bitbucket.org/pypy/pypy/changeset/8a769610ff91/
Log: win32 fixes
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -6,3 +6,7 @@
* make sure we review all the places that call ord(unichr) to check for ValueErrors
* rewrite unicodeobject.unicode_to_decimal_w to only use utf8 encoded bytes
* revisit why runicode import str_decode_utf_8_impl needed instead of runicode import str_decode_utf_8
+* revisit all places where we do utf8.decode('utf-8'), they should work directly with utf8
+ - rutf8.utf8_encode_mbcs
+ - unicodehelper.fsencode
+ - interp_posix.FileEncoder.as_unicode (used in win32)
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -77,16 +77,15 @@
def fsdecode(space, w_string):
from pypy.module._codecs import interp_codecs
state = space.fromcache(interp_codecs.CodecState)
+ errorhandler=state.decode_error_handler,
if _WIN32:
bytes = space.bytes_w(w_string)
- uni = str_decode_mbcs(bytes, 'strict',
- errorhandler=decode_error_handler(space),
+ uni = str_decode_mbcs(bytes, 'strict', True, errorhandler,
force_ignore=False)[0]
elif _MACOSX:
bytes = space.bytes_w(w_string)
uni = str_decode_utf8(
bytes, 'surrogateescape', final=True,
- errorhandler=state.decode_error_handler,
allow_surrogates=False)[0]
elif space.sys.filesystemencoding is None or state.codec_need_encodings:
# bootstrap check: if the filesystemencoding isn't initialized
@@ -109,15 +108,14 @@
from pypy.module._codecs import interp_codecs
state = space.fromcache(interp_codecs.CodecState)
if _WIN32:
- uni = space.utf8_w(w_uni)
- bytes = unicode_encode_mbcs(uni, len(uni), 'strict',
- errorhandler=encode_error_handler(space),
- force_replace=False)
+ errorhandler=state.encode_error_handler,
+ utf8 = space.utf8_w(w_uni)
+ bytes = utf8_encode_mbcs(utf8, 'strict', errorhandler)
elif _MACOSX:
- uni = space.utf8_w(w_uni)
+ utf8 = space.utf8_w(w_uni)
+ errorhandler=state.encode_error_handler,
bytes = unicodehelper.utf8_encode_utf_8(
- uni, 'surrogateescape',
- errorhandler=state.encode_error_handler,
+ utf8, 'surrogateescape',
allow_surrogates=False)
elif space.sys.filesystemencoding is None or state.codec_need_encodings:
# bootstrap check: if the filesystemencoding isn't initialized
@@ -314,16 +312,12 @@
if _WIN32:
def utf8_encode_mbcs(s, errors, errorhandler):
- s = s.decode('utf-8')
- if errorhandler is None:
- errorhandler = encode_error_handler(space)
- res = unicode_encode_mbcs(s, slen, errors, errorhandler)
+ res = rutf8.utf8_encode_mbcs(s, errors, errorhandler,
+ force_replace=False)
return res
def str_decode_mbcs(s, errors, final, errorhandler, force_ignore=True):
slen = len(s)
- if errorhandler is None:
- errorhandler = decode_error_handler(space)
res, size = runicode.str_decode_mbcs(s, slen, errors, final=final,
errorhandler=errorhandler,
force_ignore=force_ignore)
res_utf8 = runicode.unicode_encode_utf_8(res, len(res), 'strict')
diff --git a/pypy/module/posix/interp_posix.py b/pypy/module/posix/interp_posix.py
--- a/pypy/module/posix/interp_posix.py
+++ b/pypy/module/posix/interp_posix.py
@@ -56,7 +56,7 @@
return self.space.fsencode_w(self.w_obj)
def as_unicode(self):
- return self.space.unicode0_w(self.w_obj)
+ return self.space.utf8_w(self.w_obj).decode('utf8')
class FileDecoder(object):
is_unicode = False
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -810,3 +810,63 @@
res.append_slice(s, start, end)
i = end
return res.build()
+
+# ____________________________________________________________
+# MBCS codecs for Windows
+
+if sys.platform == 'win32':
+ from rpython.rtyper.lltypesystem import lltype, rffi
+ from rpython.rlib.runicode import CP_ACP, BOOLP, WideCharToMultiByte
+ from rpython.rlib import rwin32
+
+ def utf8_encode_mbcs(s, errors, errorhandler,
+ force_replace=True):
+ # Encode the utf8 string s through WideCharToMultiByte(CP_ACP). TODO: do the encoding without decoding utf8 -> unicode
+ uni = s.decode('utf8')
+ lgt = len(uni)
+ if not force_replace and errors not in ('strict', 'replace'):
+ msg = "mbcs encoding does not support errors='%s'" % errors
+ errorhandler('strict', 'mbcs', msg, s, 0, 0)  # unsupported 'errors' value is reported through errorhandler
+
+ if lgt == 0:
+ return ''
+
+ if force_replace or errors == 'replace':
+ flags = 0
+ used_default_p = lltype.nullptr(BOOLP.TO)  # null pointer, so the used_default_p checks below are skipped
+ else:
+ # strict: WC_NO_BEST_FIT_CHARS plus a raw-allocated flag, initialised to False, that both calls below receive
+ flags = rwin32.WC_NO_BEST_FIT_CHARS
+ used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
+ used_default_p[0] = rffi.cast(rwin32.BOOL, False)
+
+ try:
+ with rffi.scoped_nonmoving_unicodebuffer(uni) as dataptr:
+ # first get the size of the result (output buffer None, size 0)
+ mbcssize = WideCharToMultiByte(CP_ACP, flags,
+ dataptr, lgt, None, 0,
+ None, used_default_p)
+ if mbcssize == 0:
+ raise rwin32.lastSavedWindowsError()
+ # If we used a default char, then we failed! (only checked when used_default_p is non-null)
+ if (used_default_p and
+ rffi.cast(lltype.Bool, used_default_p[0])):
+ errorhandler('strict', 'mbcs', "invalid character",
+ s, 0, 0)
+
+ with rffi.scoped_alloc_buffer(mbcssize) as buf:
+ # do the conversion into the buffer allocated with the size measured above
+ if WideCharToMultiByte(CP_ACP, flags,
+ dataptr, lgt, buf.raw, mbcssize,
+ None, used_default_p) == 0:
+ raise rwin32.lastSavedWindowsError()
+ if (used_default_p and
+ rffi.cast(lltype.Bool, used_default_p[0])):
+ errorhandler('strict', 'mbcs', "invalid character",
+ s, 0, 0)
+ result = buf.str(mbcssize)
+ assert result is not None
+ return result
+ finally:
+ if used_default_p:  # only the strict branch malloc'ed the flag
+ lltype.free(used_default_p, flavor='raw')
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit