[pypy-commit] pypy unicode-utf8: backport changes to rpython from unicode-utf8-py3

mattip Wed, 13 Feb 2019 00:21:01 -0800

Author: Matti Picus <[email protected]>
Branch: unicode-utf8
Changeset: r95993:275eabb360c2
Date: 2019-02-13 00:51 +0200
http://bitbucket.org/pypy/pypy/changeset/275eabb360c2/


Log:    backport changes to rpython from unicode-utf8-py3

diff --git a/rpython/rlib/_rsocket_rffi.py b/rpython/rlib/_rsocket_rffi.py
--- a/rpython/rlib/_rsocket_rffi.py
+++ b/rpython/rlib/_rsocket_rffi.py
@@ -1369,8 +1369,15 @@
         return rwin32.FormatError(errno)
 
     def socket_strerror_unicode(errno):
+        return rwin32.FormatErrorW(errno)[0]
+
+    def gai_strerror_unicode(errno):
+        return rwin32.FormatErrorW(errno)[0]
+
+    def socket_strerror_utf8(errno):
         return rwin32.FormatErrorW(errno)
-    def gai_strerror_unicode(errno):
+
+    def gai_strerror_utf8(errno):
         return rwin32.FormatErrorW(errno)
 
     # WinSock does not use a bitmask in select, and uses
@@ -1386,7 +1393,16 @@
 
     def socket_strerror_unicode(errno):
         return socket_strerror_str(errno).decode('latin-1')
+
     def gai_strerror_unicode(errno):
         return gai_strerror_str(errno).decode('latin-1')
 
+    def socket_strerror_utf8(errno):
+        msg = socket_strerror_str(errno)
+        return msg, len(msg)
+
+    def gai_strerror_utf8(errno):
+        msg = gai_strerror_str(errno)
+        return msg, len(msg)
+
     MAX_FD_SIZE = FD_SETSIZE
diff --git a/rpython/rlib/rdynload.py b/rpython/rlib/rdynload.py
--- a/rpython/rlib/rdynload.py
+++ b/rpython/rlib/rdynload.py
@@ -228,18 +228,16 @@
         res = rwin32.LoadLibrary(name)
         if not res:
             err = rwin32.GetLastError_saved()
-            ustr = rwin32.FormatErrorW(err)
-            # DLOpenError unicode msg breaks translation of cpyext create_extension_module
-            raise DLOpenError(ustr.encode('utf-8'))
+            ustr, lgt = rwin32.FormatErrorW(err)
+            raise DLOpenError(ustr)
         return res
 
     def dlopenex(name):
         res = rwin32.LoadLibraryExA(name)
         if not res:
             err = rwin32.GetLastError_saved()
-            ustr = rwin32.FormatErrorW(err)
-            # DLOpenError unicode msg breaks translation of cpyext create_extension_module
-            raise DLOpenError(ustr.encode('utf-8'))
+            ustr, lgt = rwin32.FormatErrorW(err)
+            raise DLOpenError(ustr)
         return res
 
     def dlopenU(name, mode=-1):
@@ -247,9 +245,8 @@
         res = rwin32.LoadLibraryW(name)
         if not res:
             err = rwin32.GetLastError_saved()
-            ustr = rwin32.FormatErrorW(err)
-            # DLOpenError unicode msg breaks translation of cpyext create_extension_module
-            raise DLOpenError(ustr.encode('utf-8'))
+            ustr, lgt = rwin32.FormatErrorW(err)
+            raise DLOpenError(ustr)
         return res
 
     def dlclose(handle):
diff --git a/rpython/rlib/rpoll.py b/rpython/rlib/rpoll.py
--- a/rpython/rlib/rpoll.py
+++ b/rpython/rlib/rpoll.py
@@ -30,6 +30,8 @@
         return _c.socket_strerror_str(self.errno)
     def get_msg_unicode(self):
         return _c.socket_strerror_unicode(self.errno)
+    def get_msg_utf8(self):
+        return _c.socket_strerror_utf8(self.errno)
 
 class SelectError(Exception):
     def __init__(self, errno):
@@ -38,6 +40,8 @@
         return _c.socket_strerror_str(self.errno)
     def get_msg_unicode(self):
         return _c.socket_strerror_unicode(self.errno)
+    def get_msg_utf8(self):
+        return _c.socket_strerror_utf8(self.errno)
 
 # ____________________________________________________________
 # poll() for POSIX systems
diff --git a/rpython/rlib/rsocket.py b/rpython/rlib/rsocket.py
--- a/rpython/rlib/rsocket.py
+++ b/rpython/rlib/rsocket.py
@@ -1301,6 +1301,9 @@
         return ''
     def get_msg_unicode(self):
         return self.get_msg().decode('latin-1')
+    def get_msg_utf8(self):
+        msg = self.get_msg()
+        return msg, len(msg)
     def __str__(self):
         return self.get_msg()
 
@@ -1319,6 +1322,8 @@
         return _c.socket_strerror_str(self.errno)
     def get_msg_unicode(self):
         return _c.socket_strerror_unicode(self.errno)
+    def get_msg_utf8(self):
+        return _c.socket_strerror_utf8(self.errno)
 
 def last_error():
     return CSocketError(_c.geterrno())
@@ -1329,6 +1334,8 @@
         return _c.gai_strerror_str(self.errno)
     def get_msg_unicode(self):
         return _c.gai_strerror_unicode(self.errno)
+    def get_msg_utf8(self):
+        return _c.gai_strerror_utf8(self.errno)
 
 class HSocketError(SocketError):
     applevelerrcls = 'herror'
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -158,18 +158,19 @@
 def codepoint_at_pos(code, pos):
     """ Give a codepoint in code at pos - assumes valid utf8, no checking!
     """
+    lgt = len(code)
     ordch1 = ord(code[pos])
-    if ordch1 <= 0x7F:
+    if ordch1 <= 0x7F or pos +1 >= lgt:
         return ordch1
 
     ordch2 = ord(code[pos+1])
-    if ordch1 <= 0xDF:
+    if ordch1 <= 0xDF or pos +2 >= lgt:
         # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
         return (ordch1 << 6) + ordch2 - (
                (0xC0   << 6) + 0x80     )
 
     ordch3 = ord(code[pos+2])
-    if ordch1 <= 0xEF:
+    if ordch1 <= 0xEF or pos + 3 >= lgt:
         # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
         return (ordch1 << 12) + (ordch2 << 6) + ordch3 - (
                (0xE0   << 12) + (0x80   << 6) + 0x80     )
@@ -767,6 +768,9 @@
         if ordch1 <= 0x7F:
             self._pos = pos + 1
             return ordch1
+        if pos + 1 >= len(code):
+            self._pos = pos + 1
+            return ordch1
 
         ordch2 = ord(code[pos+1])
         if ordch1 <= 0xDF:
@@ -818,3 +822,63 @@
             res.append_slice(s, start, end)
             i = end
     return res.build()
+
+# ____________________________________________________________
+# MBCS codecs for Windows
+
+if sys.platform == 'win32':
+    from rpython.rtyper.lltypesystem import lltype, rffi
+    from rpython.rlib.runicode import CP_ACP, BOOLP, WideCharToMultiByte
+    from rpython.rlib import rwin32
+
+    def utf8_encode_mbcs(s, errors, errorhandler,
+                            force_replace=True):
+        # TODO: do the encoding without decoding utf8 -> unicode
+        uni = s.decode('utf8')
+        lgt = len(uni)
+        if not force_replace and errors not in ('strict', 'replace'):
+            msg = "mbcs encoding does not support errors='%s'" % errors
+            errorhandler('strict', 'mbcs', msg, s, 0, 0)
+
+        if lgt == 0:
+            return ''
+
+        if force_replace or errors == 'replace':
+            flags = 0
+            used_default_p = lltype.nullptr(BOOLP.TO)
+        else:
+            # strict
+            flags = rwin32.WC_NO_BEST_FIT_CHARS
+            used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
+            used_default_p[0] = rffi.cast(rwin32.BOOL, False)
+
+        try:
+            with rffi.scoped_nonmoving_unicodebuffer(uni) as dataptr:
+                # first get the size of the result
+                mbcssize = WideCharToMultiByte(CP_ACP, flags,
+                                               dataptr, lgt, None, 0,
+                                               None, used_default_p)
+                if mbcssize == 0:
+                    raise rwin32.lastSavedWindowsError()
+                # If we used a default char, then we failed!
+                if (used_default_p and
+                    rffi.cast(lltype.Bool, used_default_p[0])):
+                    errorhandler('strict', 'mbcs', "invalid character",
+                                 s, 0, 0)
+
+                with rffi.scoped_alloc_buffer(mbcssize) as buf:
+                    # do the conversion
+                    if WideCharToMultiByte(CP_ACP, flags,
+                                           dataptr, lgt, buf.raw, mbcssize,
+                                           None, used_default_p) == 0:
+                        raise rwin32.lastSavedWindowsError()
+                    if (used_default_p and
+                        rffi.cast(lltype.Bool, used_default_p[0])):
+                        errorhandler('strict', 'mbcs', "invalid character",
+                                     s, 0, 0)
+                    result = buf.str(mbcssize)
+                    assert result is not None
+                    return result
+        finally:
+            if used_default_p:
+                lltype.free(used_default_p, flavor='raw')
diff --git a/rpython/rlib/rwin32.py b/rpython/rlib/rwin32.py
--- a/rpython/rlib/rwin32.py
+++ b/rpython/rlib/rwin32.py
@@ -269,6 +269,9 @@
     def FormatError(code):
         return llimpl_FormatError(code)
     def FormatErrorW(code):
+        """
+        returns utf8, n_codepoints
+        """
         return llimpl_FormatErrorW(code)
 
     def llimpl_FormatError(code):
@@ -303,7 +306,7 @@
         return result
 
     def llimpl_FormatErrorW(code):
-        "Return a unicode message corresponding to the given Windows error code."
+        "Return a utf8-encoded msg and its length"
         buf = lltype.malloc(rffi.CWCHARPP.TO, 1, flavor='raw')
         buf[0] = lltype.nullptr(rffi.CWCHARP.TO)
         try:
@@ -324,9 +327,10 @@
                 buflen -= 1
 
             if buflen <= 0:
-                result = u'Windows Error %d' % (code,)
+                msg = 'Windows Error %d' % (code,)
+                result = msg, len(msg)
             else:
-                result = rffi.wcharpsize2unicode(s_buf, buflen)
+                result = rffi.wcharpsize2utf8(s_buf, buflen), buflen
         finally:
             LocalFree(rffi.cast(rffi.VOIDP, buf[0]))
             lltype.free(buf, flavor='raw')
diff --git a/rpython/rlib/test/test_rwin32.py b/rpython/rlib/test/test_rwin32.py
--- a/rpython/rlib/test/test_rwin32.py
+++ b/rpython/rlib/test/test_rwin32.py
@@ -90,9 +90,9 @@
     assert '%2' in msg
 
 def test_formaterror_unicode():
-    msg = rwin32.FormatErrorW(34)
-    assert type(msg) is unicode
-    assert u'%2' in msg
+    msg, lgt = rwin32.FormatErrorW(34)
+    assert type(msg) is str
+    assert '%2' in msg
 
 def test_loadlibraryA():
     # test0 can be loaded alone, but test1 requires the modified search path
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1029,7 +1029,7 @@
 
     s = StringBuilder(size)
     for i in range(size):
-        rutf8.unichr_as_utf8_append(s, ord(w[i]))
+        rutf8.unichr_as_utf8_append(s, ord(w[i]), True)
     return s.build()
 
 def wcharp2utf8(w):
diff --git a/rpython/rtyper/lltypesystem/test/test_rffi.py b/rpython/rtyper/lltypesystem/test/test_rffi.py
--- a/rpython/rtyper/lltypesystem/test/test_rffi.py
+++ b/rpython/rtyper/lltypesystem/test/test_rffi.py
@@ -916,3 +916,8 @@
         assert buf[1] == 'a'
         assert buf[2] == 'r'
         assert buf[3] == '\x00'
+
+def test_wcharp2utf8n():
+    w = 'hello\x00\x00\x00\x00'
+    u, i = wcharp2utf8n(w, len(w))
+    assert i == len('hello')
diff --git a/rpython/tool/leakfinder.py b/rpython/tool/leakfinder.py
--- a/rpython/tool/leakfinder.py
+++ b/rpython/tool/leakfinder.py
@@ -1,5 +1,10 @@
 import sys, gc
-import cStringIO
+try:
+    import cStringIO
+except ImportError as e:
+    if sys.version_info.major > 2:
+        raise RuntimeError('use python 2 to run tests')
+    raise
 import traceback
 
 # Track allocations to detect memory leaks.
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: backport changes to rpython from unicode-utf8-py3

Reply via email to