Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8
Changeset: r95993:275eabb360c2
Date: 2019-02-13 00:51 +0200
http://bitbucket.org/pypy/pypy/changeset/275eabb360c2/
Log: backport changes to rpython from unicode-utf8-py3 diff --git a/rpython/rlib/_rsocket_rffi.py b/rpython/rlib/_rsocket_rffi.py --- a/rpython/rlib/_rsocket_rffi.py +++ b/rpython/rlib/_rsocket_rffi.py @@ -1369,8 +1369,15 @@ return rwin32.FormatError(errno) def socket_strerror_unicode(errno): + return rwin32.FormatErrorW(errno)[0] + + def gai_strerror_unicode(errno): + return rwin32.FormatErrorW(errno)[0] + + def socket_strerror_utf8(errno): return rwin32.FormatErrorW(errno) - def gai_strerror_unicode(errno): + + def gai_strerror_utf8(errno): return rwin32.FormatErrorW(errno) # WinSock does not use a bitmask in select, and uses @@ -1386,7 +1393,16 @@ def socket_strerror_unicode(errno): return socket_strerror_str(errno).decode('latin-1') + def gai_strerror_unicode(errno): return gai_strerror_str(errno).decode('latin-1') + def socket_strerror_utf8(errno): + msg = socket_strerror_str(errno) + return msg, len(msg) + + def gai_strerror_utf8(errno): + msg = gai_strerror_str(errno) + return msg, len(msg) + MAX_FD_SIZE = FD_SETSIZE diff --git a/rpython/rlib/rdynload.py b/rpython/rlib/rdynload.py --- a/rpython/rlib/rdynload.py +++ b/rpython/rlib/rdynload.py @@ -228,18 +228,16 @@ res = rwin32.LoadLibrary(name) if not res: err = rwin32.GetLastError_saved() - ustr = rwin32.FormatErrorW(err) - # DLOpenError unicode msg breaks translation of cpyext create_extension_module - raise DLOpenError(ustr.encode('utf-8')) + ustr, lgt = rwin32.FormatErrorW(err) + raise DLOpenError(ustr) return res def dlopenex(name): res = rwin32.LoadLibraryExA(name) if not res: err = rwin32.GetLastError_saved() - ustr = rwin32.FormatErrorW(err) - # DLOpenError unicode msg breaks translation of cpyext create_extension_module - raise DLOpenError(ustr.encode('utf-8')) + ustr, lgt = rwin32.FormatErrorW(err) + raise DLOpenError(ustr) return res def dlopenU(name, mode=-1): @@ -247,9 +245,8 @@ res = rwin32.LoadLibraryW(name) if not res: err = rwin32.GetLastError_saved() - ustr = rwin32.FormatErrorW(err) - # 
DLOpenError unicode msg breaks translation of cpyext create_extension_module - raise DLOpenError(ustr.encode('utf-8')) + ustr, lgt = rwin32.FormatErrorW(err) + raise DLOpenError(ustr) return res def dlclose(handle): diff --git a/rpython/rlib/rpoll.py b/rpython/rlib/rpoll.py --- a/rpython/rlib/rpoll.py +++ b/rpython/rlib/rpoll.py @@ -30,6 +30,8 @@ return _c.socket_strerror_str(self.errno) def get_msg_unicode(self): return _c.socket_strerror_unicode(self.errno) + def get_msg_utf8(self): + return _c.socket_strerror_utf8(self.errno) class SelectError(Exception): def __init__(self, errno): @@ -38,6 +40,8 @@ return _c.socket_strerror_str(self.errno) def get_msg_unicode(self): return _c.socket_strerror_unicode(self.errno) + def get_msg_utf8(self): + return _c.socket_strerror_utf8(self.errno) # ____________________________________________________________ # poll() for POSIX systems diff --git a/rpython/rlib/rsocket.py b/rpython/rlib/rsocket.py --- a/rpython/rlib/rsocket.py +++ b/rpython/rlib/rsocket.py @@ -1301,6 +1301,9 @@ return '' def get_msg_unicode(self): return self.get_msg().decode('latin-1') + def get_msg_utf8(self): + msg = self.get_msg() + return msg, len(msg) def __str__(self): return self.get_msg() @@ -1319,6 +1322,8 @@ return _c.socket_strerror_str(self.errno) def get_msg_unicode(self): return _c.socket_strerror_unicode(self.errno) + def get_msg_utf8(self): + return _c.socket_strerror_utf8(self.errno) def last_error(): return CSocketError(_c.geterrno()) @@ -1329,6 +1334,8 @@ return _c.gai_strerror_str(self.errno) def get_msg_unicode(self): return _c.gai_strerror_unicode(self.errno) + def get_msg_utf8(self): + return _c.gai_strerror_utf8(self.errno) class HSocketError(SocketError): applevelerrcls = 'herror' diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -158,18 +158,19 @@ def codepoint_at_pos(code, pos): """ Give a codepoint in code at pos - assumes valid utf8, no checking! 
""" + lgt = len(code) ordch1 = ord(code[pos]) - if ordch1 <= 0x7F: + if ordch1 <= 0x7F or pos +1 >= lgt: return ordch1 ordch2 = ord(code[pos+1]) - if ordch1 <= 0xDF: + if ordch1 <= 0xDF or pos +2 >= lgt: # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz return (ordch1 << 6) + ordch2 - ( (0xC0 << 6) + 0x80 ) ordch3 = ord(code[pos+2]) - if ordch1 <= 0xEF: + if ordch1 <= 0xEF or pos + 3 >= lgt: # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz return (ordch1 << 12) + (ordch2 << 6) + ordch3 - ( (0xE0 << 12) + (0x80 << 6) + 0x80 ) @@ -767,6 +768,9 @@ if ordch1 <= 0x7F: self._pos = pos + 1 return ordch1 + if pos + 1 >= len(code): + self._pos = pos + 1 + return ordch1 ordch2 = ord(code[pos+1]) if ordch1 <= 0xDF: @@ -818,3 +822,63 @@ res.append_slice(s, start, end) i = end return res.build() + +# ____________________________________________________________ +# MBCS codecs for Windows + +if sys.platform == 'win32': + from rpython.rtyper.lltypesystem import lltype, rffi + from rpython.rlib.runicode import CP_ACP, BOOLP, WideCharToMultiByte + from rpython.rlib import rwin32 + + def utf8_encode_mbcs(s, errors, errorhandler, + force_replace=True): + # TODO: do the encoding without decoding utf8 -> unicode + uni = s.decode('utf8') + lgt = len(uni) + if not force_replace and errors not in ('strict', 'replace'): + msg = "mbcs encoding does not support errors='%s'" % errors + errorhandler('strict', 'mbcs', msg, s, 0, 0) + + if lgt == 0: + return '' + + if force_replace or errors == 'replace': + flags = 0 + used_default_p = lltype.nullptr(BOOLP.TO) + else: + # strict + flags = rwin32.WC_NO_BEST_FIT_CHARS + used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw') + used_default_p[0] = rffi.cast(rwin32.BOOL, False) + + try: + with rffi.scoped_nonmoving_unicodebuffer(uni) as dataptr: + # first get the size of the result + mbcssize = WideCharToMultiByte(CP_ACP, flags, + dataptr, lgt, None, 0, + None, used_default_p) + if mbcssize == 0: + raise rwin32.lastSavedWindowsError() + # 
If we used a default char, then we failed! + if (used_default_p and + rffi.cast(lltype.Bool, used_default_p[0])): + errorhandler('strict', 'mbcs', "invalid character", + s, 0, 0) + + with rffi.scoped_alloc_buffer(mbcssize) as buf: + # do the conversion + if WideCharToMultiByte(CP_ACP, flags, + dataptr, lgt, buf.raw, mbcssize, + None, used_default_p) == 0: + raise rwin32.lastSavedWindowsError() + if (used_default_p and + rffi.cast(lltype.Bool, used_default_p[0])): + errorhandler('strict', 'mbcs', "invalid character", + s, 0, 0) + result = buf.str(mbcssize) + assert result is not None + return result + finally: + if used_default_p: + lltype.free(used_default_p, flavor='raw') diff --git a/rpython/rlib/rwin32.py b/rpython/rlib/rwin32.py --- a/rpython/rlib/rwin32.py +++ b/rpython/rlib/rwin32.py @@ -269,6 +269,9 @@ def FormatError(code): return llimpl_FormatError(code) def FormatErrorW(code): + """ + returns utf8, n_codepoints + """ return llimpl_FormatErrorW(code) def llimpl_FormatError(code): @@ -303,7 +306,7 @@ return result def llimpl_FormatErrorW(code): - "Return a unicode message corresponding to the given Windows error code." 
+ "Return a utf8-encoded msg and its length" buf = lltype.malloc(rffi.CWCHARPP.TO, 1, flavor='raw') buf[0] = lltype.nullptr(rffi.CWCHARP.TO) try: @@ -324,9 +327,10 @@ buflen -= 1 if buflen <= 0: - result = u'Windows Error %d' % (code,) + msg = 'Windows Error %d' % (code,) + result = msg, len(msg) else: - result = rffi.wcharpsize2unicode(s_buf, buflen) + result = rffi.wcharpsize2utf8(s_buf, buflen), buflen finally: LocalFree(rffi.cast(rffi.VOIDP, buf[0])) lltype.free(buf, flavor='raw') diff --git a/rpython/rlib/test/test_rwin32.py b/rpython/rlib/test/test_rwin32.py --- a/rpython/rlib/test/test_rwin32.py +++ b/rpython/rlib/test/test_rwin32.py @@ -90,9 +90,9 @@ assert '%2' in msg def test_formaterror_unicode(): - msg = rwin32.FormatErrorW(34) - assert type(msg) is unicode - assert u'%2' in msg + msg, lgt = rwin32.FormatErrorW(34) + assert type(msg) is str + assert '%2' in msg def test_loadlibraryA(): # test0 can be loaded alone, but test1 requires the modified search path diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py --- a/rpython/rtyper/lltypesystem/rffi.py +++ b/rpython/rtyper/lltypesystem/rffi.py @@ -1029,7 +1029,7 @@ s = StringBuilder(size) for i in range(size): - rutf8.unichr_as_utf8_append(s, ord(w[i])) + rutf8.unichr_as_utf8_append(s, ord(w[i]), True) return s.build() def wcharp2utf8(w): diff --git a/rpython/rtyper/lltypesystem/test/test_rffi.py b/rpython/rtyper/lltypesystem/test/test_rffi.py --- a/rpython/rtyper/lltypesystem/test/test_rffi.py +++ b/rpython/rtyper/lltypesystem/test/test_rffi.py @@ -916,3 +916,8 @@ assert buf[1] == 'a' assert buf[2] == 'r' assert buf[3] == '\x00' + +def test_wcharp2utf8n(): + w = 'hello\x00\x00\x00\x00' + u, i = wcharp2utf8n(w, len(w)) + assert i == len('hello') diff --git a/rpython/tool/leakfinder.py b/rpython/tool/leakfinder.py --- a/rpython/tool/leakfinder.py +++ b/rpython/tool/leakfinder.py @@ -1,5 +1,10 @@ import sys, gc -import cStringIO +try: + import cStringIO +except ImportError 
as e: + if sys.version_info.major > 2: + raise RuntimeError('use python 2 to run tests') + raise import traceback # Track allocations to detect memory leaks. _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit