Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8-test
Changeset: r93344:1665df77270e
Date: 2017-12-10 05:27 +0000
http://bitbucket.org/pypy/pypy/changeset/1665df77270e/

Log:    hg merge unicode-utf8

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -12,3 +12,4 @@
 * improve performance of splitlines
 
 * fix _pypyjson to not use a wrapped dict when decoding an object
+* make sure we review all the places that call ord(unichr) to check for 
ValueErrors
\ No newline at end of file
diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1098,22 +1098,19 @@
         elif ch >= 0xE000 or allow_surrogates:
             _STORECHAR(result, ch, byteorder)
         else:
-            ru, newindex = errorhandler(errors, public_encoding_name,
-                                   'surrogates not allowed',
-                                    s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR(result, ord(ch), byteorder)
+            res_8, newindex = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos - 1, pos)
+            for cp in rutf8.Utf8StringIterator(res_8):
+                if cp < 0xD800:
+                    _STORECHAR(result, cp, byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
                                  s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
 
         pos = rutf8.next_codepoint_pos(s, pos)
@@ -1282,22 +1279,19 @@
         ch = rutf8.codepoint_at_pos(s, pos)
         pos = rutf8.next_codepoint_pos(s, pos)
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            ru, newindex = errorhandler(errors, public_encoding_name,
-                                        'surrogates not allowed',
-                                        s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR32(result, ord(ch), byteorder)
+            res_8, newindex = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos - 1, pos)
+            for ch in rutf8.Utf8StringIterator(res_8):
+                if ch < 0xD800:
+                    _STORECHAR32(result, ch, byteorder)
                 else:
-                    errorhandler('strict', public_encoding_name,
-                                 'surrogates not allowed',
-                                 s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+                    errorhandler(
+                        'strict', public_encoding_name, 'surrogates not 
allowed',
+                        s, pos - 1, pos)
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
         _STORECHAR32(result, ch, byteorder)
         index += 1
@@ -1425,8 +1419,7 @@
     lgt = rutf8.check_utf8(r, True)
     return r, pos, lgt
 
-def utf8_encode_charmap(s, errors, errorhandler=None,
-                           mapping=None):
+def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
     size = len(s)
     if mapping is None:
         return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
@@ -1438,31 +1431,29 @@
     index = 0
     while pos < size:
         ch = rutf8.codepoint_at_pos(s, pos)
-
         c = mapping.get(ch, '')
         if len(c) == 0:
-            # collect all unencodable chars. Important for narrow builds.
-            collend = rutf8.next_codepoint_pos(s, pos)
-            endindex = index + 1
-            while collend < size and mapping.get(rutf8.codepoint_at_pos(s, 
collend), '') == '':
-                collend = rutf8.next_codepoint_pos(s, collend)
-                endindex += 1
-            rs, endindex = errorhandler(errors, "charmap",
+            # collect all unencodable chars.
+            startindex = index
+            pos = rutf8.next_codepoint_pos(s, pos)
+            index += 1
+            while (pos < size and
+                   mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
+                pos = rutf8.next_codepoint_pos(s, pos)
+                index += 1
+            res_8, newindex = errorhandler(errors, "charmap",
                                    "character maps to <undefined>",
-                                   s, index, endindex)
-            j = 0
-            for _ in range(endindex - index):
-                ch2 = rutf8.codepoint_at_pos(rs, j)
-                ch2 = mapping.get(ch2, '')
+                                   s, startindex, index)
+            for cp2 in rutf8.Utf8StringIterator(res_8):
+                ch2 = mapping.get(cp2, '')
                 if not ch2:
                     errorhandler(
-                        "strict", "charmap",
-                        "character maps to <undefined>",
-                        s,  index, index + 1)
+                        "strict", "charmap", "character maps to <undefined>",
+                        s,  startindex, index)
                 result.append(ch2)
-                index += 1
-                j = rutf8.next_codepoint_pos(rs, j)
-                pos = rutf8.next_codepoint_pos(s, pos)
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
         result.append(c)
         index += 1
diff --git a/pypy/module/_codecs/test/test_codecs.py 
b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -537,8 +537,12 @@
         assert '\xff'.decode('utf-7', 'ignore') == ''
         assert '\x00'.decode('unicode-internal', 'ignore') == ''
 
-    def test_backslahreplace(self):
-        assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') 
== 'a\\xac\u1234\u20ac\u8000'
+    def test_backslashreplace(self):
+        sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+        expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+        assert sin.encode('ascii', 'backslashreplace') == expected
+        expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff"  # \xac and \u20ac encode directly
+        assert sin.encode("iso-8859-15", "backslashreplace") == expected
 
     def test_badhandler(self):
         import codecs
diff --git a/pypy/module/_locale/interp_locale.py 
b/pypy/module/_locale/interp_locale.py
--- a/pypy/module/_locale/interp_locale.py
+++ b/pypy/module/_locale/interp_locale.py
@@ -133,10 +133,11 @@
             rffi.free_charp(s1_c)
             rffi.free_charp(s2_c)
 
-    s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2)
+    s1, l1 = space.utf8_len_w(w_s1)
+    s2, l2 = space.utf8_len_w(w_s2)
 
-    s1_c = rffi.unicode2wcharp(s1)
-    s2_c = rffi.unicode2wcharp(s2)
+    s1_c = rffi.utf82wcharp(s1, l1)
+    s2_c = rffi.utf82wcharp(s2, l2)
     try:
         result = _wcscoll(s1_c, s2_c)
     finally:
diff --git a/pypy/module/_rawffi/alt/type_converter.py 
b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -227,8 +227,8 @@
             ucharval = self.get_char(w_ffitype)
             return space.newbytes(chr(ucharval))
         elif w_ffitype.is_unichar():
-            wcharval = self.get_unichar(w_ffitype)
-            return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1)
+            wcharval = r_uint(self.get_unichar(w_ffitype))
+            return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
         elif w_ffitype.is_double():
             return self._float(w_ffitype)
         elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_rawffi/interp_rawffi.py 
b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -448,7 +448,8 @@
             elif c == 'c':
                 return space.newbytes(func(add_arg, argdesc, ll_type))
             elif c == 'u':
-                return space.newunicode(func(add_arg, argdesc, ll_type))
+                return space.newutf8(rutf8.unichr_as_utf8(
+                    ord(func(add_arg, argdesc, ll_type))), 1)
             elif c == 'f' or c == 'd' or c == 'g':
                 return space.newfloat(float(func(add_arg, argdesc, ll_type)))
             else:
@@ -596,10 +597,10 @@
         return space.w_None
     wcharp_addr = rffi.cast(rffi.CWCHARP, address)
     if maxlength == -1:
-        s = rffi.wcharp2unicode(wcharp_addr)
+        s, lgt = rffi.wcharp2utf8(wcharp_addr)
     else:
-        s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
-    return space.newunicode(s)
+        s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength)
+    return space.newutf8(s, lgt)
 
 @unwrap_spec(address=r_uint, maxlength=int)
 def charp2rawstring(space, address, maxlength=-1):
@@ -612,8 +613,8 @@
 def wcharp2rawunicode(space, address, maxlength=-1):
     if maxlength == -1:
         return wcharp2unicode(space, address)
-    s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength)
-    return space.newunicode(s)
+    s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength)
+    return space.newutf8(s, maxlength)
 
 @unwrap_spec(address=r_uint, newcontent='bufferstr')
 def rawstring2charp(space, address, newcontent):
diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py
--- a/rpython/annotator/unaryop.py
+++ b/rpython/annotator/unaryop.py
@@ -792,7 +792,7 @@
     def ord(self):
         # warning, on 32-bit with 32-bit unichars, this might return
         # negative numbers
-        return SomeInteger()
+        return SomeInteger(nonneg=True)
 
 class __extend__(SomeIterator):
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -19,7 +19,7 @@
 from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
 from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib import jit
+from rpython.rlib import jit, types
 from rpython.rlib.signature import signature
 from rpython.rlib.types import char, none
 from rpython.rlib.rarithmetic import r_uint
@@ -27,6 +27,8 @@
 from rpython.rtyper.lltypesystem import lltype, rffi
 
 
+# we need a way to accept both r_uint and int(nonneg=True)
+#@signature(types.int_nonneg(), types.bool(), returns=types.str())
 def unichr_as_utf8(code, allow_surrogates=False):
     """Encode code (numeric value) as utf8 encoded string
     """
@@ -437,7 +439,7 @@
             low = codepoint_at_pos(utf8, i)
             if 0xDC00 <= low <= 0xDFFF:
                 uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
-                i = next_codepoint_pos(utf8, i)                
+                i = next_codepoint_pos(utf8, i)
             # else not really a surrogate pair, just append high
         else:
             i = next_codepoint_pos(utf8, i)
@@ -535,6 +537,13 @@
     else:
         return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
 
+def _pos_at_index(utf8, index):
+    # Slow! Starts at position 0 and steps one codepoint at a time, `index` times.
+    pos = 0
+    for _ in range(index):
+        pos = next_codepoint_pos(utf8, pos)
+    return pos
+
 @jit.dont_look_inside
 def codepoint_at_index(utf8, storage, index):
     """ Return codepoint of a character inside utf8 encoded string, given
diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py
--- a/rpython/rlib/types.py
+++ b/rpython/rlib/types.py
@@ -26,6 +26,8 @@
 def int():
     return model.SomeInteger()
 
+def int_nonneg():
+    return model.SomeInteger(nonneg=True)  # same as int(), but flagged nonneg
 
 def bool():
     return model.SomeBool()
diff --git a/rpython/rtyper/lltypesystem/rffi.py 
b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1019,7 +1019,27 @@
     s = StringBuilder(size)
     for i in range(size):
         rutf8.unichr_as_utf8_append(s, ord(w[i]))
-    return s.build()    
+    return s.build()
+
+def wcharp2utf8(w):
+    from rpython.rlib import rutf8
+
+    s = rutf8.Utf8StringBuilder()
+    i = 0
+    while ord(w[i]):  # stop at the first zero element
+        s.append_code(ord(w[i]))
+        i += 1
+    return s.build(), i  # built string and the number of elements appended
+
+def wcharp2utf8n(w, maxlen):
+    from rpython.rlib import rutf8
+
+    s = rutf8.Utf8StringBuilder(maxlen)
+    i = 0
+    while i < maxlen and ord(w[i]):  # stop at maxlen or the first zero element
+        s.append_code(ord(w[i]))
+        i += 1
+    return s.build(), i  # built string and the number of elements appended
 
 def utf82wcharp(utf8, utf8len):
     from rpython.rlib import rutf8
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to