Author: Matti Picus <[email protected]>
Branch: unicode-utf8
Changeset: r95993:275eabb360c2
Date: 2019-02-13 00:51 +0200
http://bitbucket.org/pypy/pypy/changeset/275eabb360c2/
Log: backport changes to rpython from unicode-utf8-py3
diff --git a/rpython/rlib/_rsocket_rffi.py b/rpython/rlib/_rsocket_rffi.py
--- a/rpython/rlib/_rsocket_rffi.py
+++ b/rpython/rlib/_rsocket_rffi.py
@@ -1369,8 +1369,15 @@
return rwin32.FormatError(errno)
def socket_strerror_unicode(errno):
+ return rwin32.FormatErrorW(errno)[0]
+
+ def gai_strerror_unicode(errno):
+ return rwin32.FormatErrorW(errno)[0]
+
+ def socket_strerror_utf8(errno):
return rwin32.FormatErrorW(errno)
- def gai_strerror_unicode(errno):
+
+ def gai_strerror_utf8(errno):
return rwin32.FormatErrorW(errno)
# WinSock does not use a bitmask in select, and uses
@@ -1386,7 +1393,16 @@
def socket_strerror_unicode(errno):
return socket_strerror_str(errno).decode('latin-1')
+
def gai_strerror_unicode(errno):
return gai_strerror_str(errno).decode('latin-1')
+ def socket_strerror_utf8(errno):
+ msg = socket_strerror_str(errno)
+ return msg, len(msg)
+
+ def gai_strerror_utf8(errno):
+ msg = gai_strerror_str(errno)
+ return msg, len(msg)
+
MAX_FD_SIZE = FD_SETSIZE
diff --git a/rpython/rlib/rdynload.py b/rpython/rlib/rdynload.py
--- a/rpython/rlib/rdynload.py
+++ b/rpython/rlib/rdynload.py
@@ -228,18 +228,16 @@
res = rwin32.LoadLibrary(name)
if not res:
err = rwin32.GetLastError_saved()
- ustr = rwin32.FormatErrorW(err)
- # DLOpenError unicode msg breaks translation of cpyext create_extension_module
- raise DLOpenError(ustr.encode('utf-8'))
+ ustr, lgt = rwin32.FormatErrorW(err)
+ raise DLOpenError(ustr)
return res
def dlopenex(name):
res = rwin32.LoadLibraryExA(name)
if not res:
err = rwin32.GetLastError_saved()
- ustr = rwin32.FormatErrorW(err)
- # DLOpenError unicode msg breaks translation of cpyext create_extension_module
- raise DLOpenError(ustr.encode('utf-8'))
+ ustr, lgt = rwin32.FormatErrorW(err)
+ raise DLOpenError(ustr)
return res
def dlopenU(name, mode=-1):
@@ -247,9 +245,8 @@
res = rwin32.LoadLibraryW(name)
if not res:
err = rwin32.GetLastError_saved()
- ustr = rwin32.FormatErrorW(err)
- # DLOpenError unicode msg breaks translation of cpyext create_extension_module
- raise DLOpenError(ustr.encode('utf-8'))
+ ustr, lgt = rwin32.FormatErrorW(err)
+ raise DLOpenError(ustr)
return res
def dlclose(handle):
diff --git a/rpython/rlib/rpoll.py b/rpython/rlib/rpoll.py
--- a/rpython/rlib/rpoll.py
+++ b/rpython/rlib/rpoll.py
@@ -30,6 +30,8 @@
return _c.socket_strerror_str(self.errno)
def get_msg_unicode(self):
return _c.socket_strerror_unicode(self.errno)
+ def get_msg_utf8(self):
+ return _c.socket_strerror_utf8(self.errno)
class SelectError(Exception):
def __init__(self, errno):
@@ -38,6 +40,8 @@
return _c.socket_strerror_str(self.errno)
def get_msg_unicode(self):
return _c.socket_strerror_unicode(self.errno)
+ def get_msg_utf8(self):
+ return _c.socket_strerror_utf8(self.errno)
# ____________________________________________________________
# poll() for POSIX systems
diff --git a/rpython/rlib/rsocket.py b/rpython/rlib/rsocket.py
--- a/rpython/rlib/rsocket.py
+++ b/rpython/rlib/rsocket.py
@@ -1301,6 +1301,9 @@
return ''
def get_msg_unicode(self):
return self.get_msg().decode('latin-1')
+ def get_msg_utf8(self):
+ msg = self.get_msg()
+ return msg, len(msg)
def __str__(self):
return self.get_msg()
@@ -1319,6 +1322,8 @@
return _c.socket_strerror_str(self.errno)
def get_msg_unicode(self):
return _c.socket_strerror_unicode(self.errno)
+ def get_msg_utf8(self):
+ return _c.socket_strerror_utf8(self.errno)
def last_error():
return CSocketError(_c.geterrno())
@@ -1329,6 +1334,8 @@
return _c.gai_strerror_str(self.errno)
def get_msg_unicode(self):
return _c.gai_strerror_unicode(self.errno)
+ def get_msg_utf8(self):
+ return _c.gai_strerror_utf8(self.errno)
class HSocketError(SocketError):
applevelerrcls = 'herror'
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -158,18 +158,19 @@
def codepoint_at_pos(code, pos):
""" Give a codepoint in code at pos - assumes valid utf8, no checking!
"""
+ lgt = len(code)
ordch1 = ord(code[pos])
- if ordch1 <= 0x7F:
+ if ordch1 <= 0x7F or pos +1 >= lgt:
return ordch1
ordch2 = ord(code[pos+1])
- if ordch1 <= 0xDF:
+ if ordch1 <= 0xDF or pos +2 >= lgt:
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
return (ordch1 << 6) + ordch2 - (
(0xC0 << 6) + 0x80 )
ordch3 = ord(code[pos+2])
- if ordch1 <= 0xEF:
+ if ordch1 <= 0xEF or pos + 3 >= lgt:
# 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
return (ordch1 << 12) + (ordch2 << 6) + ordch3 - (
(0xE0 << 12) + (0x80 << 6) + 0x80 )
@@ -767,6 +768,9 @@
if ordch1 <= 0x7F:
self._pos = pos + 1
return ordch1
+ if pos + 1 >= len(code):
+ self._pos = pos + 1
+ return ordch1
ordch2 = ord(code[pos+1])
if ordch1 <= 0xDF:
@@ -818,3 +822,63 @@
res.append_slice(s, start, end)
i = end
return res.build()
+
+# ____________________________________________________________
+# MBCS codecs for Windows
+
+if sys.platform == 'win32':
+ from rpython.rtyper.lltypesystem import lltype, rffi
+ from rpython.rlib.runicode import CP_ACP, BOOLP, WideCharToMultiByte
+ from rpython.rlib import rwin32
+
+ def utf8_encode_mbcs(s, errors, errorhandler,
+ force_replace=True):
+ # TODO: do the encoding without decoding utf8 -> unicode
+ uni = s.decode('utf8')
+ lgt = len(uni)
+ if not force_replace and errors not in ('strict', 'replace'):
+ msg = "mbcs encoding does not support errors='%s'" % errors
+ errorhandler('strict', 'mbcs', msg, s, 0, 0)
+
+ if lgt == 0:
+ return ''
+
+ if force_replace or errors == 'replace':
+ flags = 0
+ used_default_p = lltype.nullptr(BOOLP.TO)
+ else:
+ # strict
+ flags = rwin32.WC_NO_BEST_FIT_CHARS
+ used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
+ used_default_p[0] = rffi.cast(rwin32.BOOL, False)
+
+ try:
+ with rffi.scoped_nonmoving_unicodebuffer(uni) as dataptr:
+ # first get the size of the result
+ mbcssize = WideCharToMultiByte(CP_ACP, flags,
+ dataptr, lgt, None, 0,
+ None, used_default_p)
+ if mbcssize == 0:
+ raise rwin32.lastSavedWindowsError()
+ # If we used a default char, then we failed!
+ if (used_default_p and
+ rffi.cast(lltype.Bool, used_default_p[0])):
+ errorhandler('strict', 'mbcs', "invalid character",
+ s, 0, 0)
+
+ with rffi.scoped_alloc_buffer(mbcssize) as buf:
+ # do the conversion
+ if WideCharToMultiByte(CP_ACP, flags,
+ dataptr, lgt, buf.raw, mbcssize,
+ None, used_default_p) == 0:
+ raise rwin32.lastSavedWindowsError()
+ if (used_default_p and
+ rffi.cast(lltype.Bool, used_default_p[0])):
+ errorhandler('strict', 'mbcs', "invalid character",
+ s, 0, 0)
+ result = buf.str(mbcssize)
+ assert result is not None
+ return result
+ finally:
+ if used_default_p:
+ lltype.free(used_default_p, flavor='raw')
diff --git a/rpython/rlib/rwin32.py b/rpython/rlib/rwin32.py
--- a/rpython/rlib/rwin32.py
+++ b/rpython/rlib/rwin32.py
@@ -269,6 +269,9 @@
def FormatError(code):
return llimpl_FormatError(code)
def FormatErrorW(code):
+ """
+ returns utf8, n_codepoints
+ """
return llimpl_FormatErrorW(code)
def llimpl_FormatError(code):
@@ -303,7 +306,7 @@
return result
def llimpl_FormatErrorW(code):
- "Return a unicode message corresponding to the given Windows error code."
+ "Return a utf8-encoded msg and its length"
buf = lltype.malloc(rffi.CWCHARPP.TO, 1, flavor='raw')
buf[0] = lltype.nullptr(rffi.CWCHARP.TO)
try:
@@ -324,9 +327,10 @@
buflen -= 1
if buflen <= 0:
- result = u'Windows Error %d' % (code,)
+ msg = 'Windows Error %d' % (code,)
+ result = msg, len(msg)
else:
- result = rffi.wcharpsize2unicode(s_buf, buflen)
+ result = rffi.wcharpsize2utf8(s_buf, buflen), buflen
finally:
LocalFree(rffi.cast(rffi.VOIDP, buf[0]))
lltype.free(buf, flavor='raw')
diff --git a/rpython/rlib/test/test_rwin32.py b/rpython/rlib/test/test_rwin32.py
--- a/rpython/rlib/test/test_rwin32.py
+++ b/rpython/rlib/test/test_rwin32.py
@@ -90,9 +90,9 @@
assert '%2' in msg
def test_formaterror_unicode():
- msg = rwin32.FormatErrorW(34)
- assert type(msg) is unicode
- assert u'%2' in msg
+ msg, lgt = rwin32.FormatErrorW(34)
+ assert type(msg) is str
+ assert '%2' in msg
def test_loadlibraryA():
# test0 can be loaded alone, but test1 requires the modified search path
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1029,7 +1029,7 @@
s = StringBuilder(size)
for i in range(size):
- rutf8.unichr_as_utf8_append(s, ord(w[i]))
+ rutf8.unichr_as_utf8_append(s, ord(w[i]), True)
return s.build()
def wcharp2utf8(w):
diff --git a/rpython/rtyper/lltypesystem/test/test_rffi.py b/rpython/rtyper/lltypesystem/test/test_rffi.py
--- a/rpython/rtyper/lltypesystem/test/test_rffi.py
+++ b/rpython/rtyper/lltypesystem/test/test_rffi.py
@@ -916,3 +916,8 @@
assert buf[1] == 'a'
assert buf[2] == 'r'
assert buf[3] == '\x00'
+
+def test_wcharp2utf8n():
+ w = 'hello\x00\x00\x00\x00'
+ u, i = wcharp2utf8n(w, len(w))
+ assert i == len('hello')
diff --git a/rpython/tool/leakfinder.py b/rpython/tool/leakfinder.py
--- a/rpython/tool/leakfinder.py
+++ b/rpython/tool/leakfinder.py
@@ -1,5 +1,10 @@
import sys, gc
-import cStringIO
+try:
+ import cStringIO
+except ImportError as e:
+ if sys.version_info.major > 2:
+ raise RuntimeError('use python 2 to run tests')
+ raise
import traceback
# Track allocations to detect memory leaks.
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit