Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8-test
Changeset: r93324:e6db8eec731a
Date: 2017-12-09 02:46 +0000
http://bitbucket.org/pypy/pypy/changeset/e6db8eec731a/

Log:    hg merge unicode-utf8

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,4 @@
+import pytest
 from hypothesis import given, strategies
 
 from rpython.rlib import rutf8
@@ -5,6 +6,7 @@
 from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
 from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
 
 def decode_utf8(u):
     return str_decode_utf8(u, True, "strict", None)
@@ -68,3 +70,16 @@
 def test_unicode_escape(u):
     r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
     assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+    assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+    with pytest.raises(ValueError):
+        uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+    state = space.fromcache(CodecState)
+    handler = state.encode_error_handler
+    assert uh.unicode_encode_decimal(
+        u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+    result = uh.unicode_encode_decimal(
+        u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+    assert result == '12&#4660;'
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -7,6 +7,7 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rtyper.lltypesystem import rffi
 from pypy.module._codecs import interp_codecs
+from pypy.module.unicodedata import unicodedb
 
 @specialize.memo()
 def decode_error_handler(space):
@@ -35,6 +36,16 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_encode
 
+def default_error_encode(
+        errors, encoding, msg, u, startingpos, endingpos):
+    """A default handler, for tests"""
+    assert endingpos >= 0
+    if errors == 'replace':
+        return '?', endingpos
+    if errors == 'ignore':
+        return '', endingpos
+    raise ValueError
+
 def convert_arg_to_w_unicode(space, w_arg, strict=None):
     return space.convert_arg_to_w_unicode(w_arg)
 
@@ -1458,3 +1469,70 @@
         pos = rutf8.next_codepoint_pos(s, pos)
     return result.build()
 
+# ____________________________________________________________
+# Decimal Encoder
+def unicode_encode_decimal(s, errors, errorhandler=None):
+    """Converts whitespace to ' ', decimal characters to their
+    corresponding ASCII digit and all other Latin-1 characters except
+    \0 as-is. Characters outside this range (Unicode ordinals 1-256)
+    are treated as errors. This includes embedded NULL bytes.
+    """
+    if errorhandler is None:
+        errorhandler = default_error_encode
+    result = StringBuilder(len(s))
+    pos = 0
+    i = 0
+    it = rutf8.Utf8StringIterator(s)
+    for ch in it:
+        if unicodedb.isspace(ch):
+            result.append(' ')
+            i += 1
+            continue
+        try:
+            decimal = unicodedb.decimal(ch)
+        except KeyError:
+            pass
+        else:
+            result.append(chr(48 + decimal))
+            i += 1
+            continue
+        if 0 < ch < 256:
+            result.append(chr(ch))
+            i += 1
+            continue
+        # All other characters are considered unencodable
+        start_index = i
+        i += 1
+        while not it.done():
+            ch = rutf8.codepoint_at_pos(s, it.get_pos())
+            try:
+                if (0 < ch < 256 or unicodedb.isspace(ch) or
+                        unicodedb.decimal(ch) >= 0):
+                    break
+            except KeyError:
+                # not a decimal
+                pass
+            if it.done():
+                break
+            ch = next(it)
+            i += 1
+        end_index = i
+        msg = "invalid decimal Unicode string"
+        r, pos = errorhandler(
+            errors, 'decimal', msg, s, start_index, end_index)
+        for ch in rutf8.Utf8StringIterator(r):
+            if unicodedb.isspace(ch):
+                result.append(' ')
+                continue
+            try:
+                decimal = unicodedb.decimal(ch)
+            except KeyError:
+                pass
+            else:
+                result.append(chr(48 + decimal))
+                continue
+            if 0 < ch < 256:
+                result.append(chr(ch))
+                continue
+            errorhandler('strict', 'decimal', msg, s, start_index, end_index)
+    return result.build()
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -3,6 +3,7 @@
 from rpython.rlib.objectmodel import specialize, always_inline, r_dict
 from rpython.rlib import rfloat, runicode, rutf8
 from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rarithmetic import r_uint
 from pypy.interpreter.error import oefmt
 from pypy.interpreter import unicodehelper
 
@@ -366,7 +367,7 @@
             return # help the annotator to know that we'll never go beyond
                    # this point
         #
-        utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True)
+        utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True)
         builder.append(utf8_ch)
         return i
 
@@ -400,7 +401,7 @@
                 break
             elif ch == '\\' or ch < '\x20':
                 self.pos = i-1
-                return self.space.unicode_w(self.decode_string_escaped(start))
+                return self.decode_string_escaped(start)
             strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i]))
             bits |= ord(ch)
         length = i - start - 1
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -128,7 +128,7 @@
         intval: lltype.Signed
         """
         self.error(w_ffitype, w_obj)
-        
+
     def handle_unichar(self, w_ffitype, w_obj, intval):
         """
         intval: lltype.Signed
@@ -174,7 +174,7 @@
     def handle_struct_rawffi(self, w_ffitype, w_structinstance):
         """
         This method should be killed as soon as we remove support for _rawffi structures
-        
+
         w_structinstance: W_StructureInstance
         """
         self.error(w_ffitype, w_structinstance)
@@ -228,7 +228,7 @@
             return space.newbytes(chr(ucharval))
         elif w_ffitype.is_unichar():
             wcharval = self.get_unichar(w_ffitype)
-            return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
+            return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1)
         elif w_ffitype.is_double():
             return self._float(w_ffitype)
         elif w_ffitype.is_singlefloat():
@@ -349,7 +349,7 @@
     def get_struct_rawffi(self, w_ffitype, w_structdescr):
         """
         This should be killed as soon as we kill support for _rawffi structures
-        
+
         Return type: lltype.Unsigned
         (the address of the structure)
         """
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -596,9 +596,9 @@
         return space.w_None
     wcharp_addr = rffi.cast(rffi.CWCHARP, address)
     if maxlength == -1:
-        s = rffi.wcharp2utf8(wcharp_addr)
+        s = rffi.wcharp2unicode(wcharp_addr)
     else:
-        s = rffi.wcharpsize2utf8(wcharp_addr, maxlength)
+        s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
     return space.newunicode(s)
 
 @unwrap_spec(address=r_uint, maxlength=int)
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1,7 +1,7 @@
 from rpython.rlib import jit, rgc, rutf8
 from rpython.rlib.buffer import RawBuffer
 from rpython.rlib.objectmodel import keepalive_until_here
-from rpython.rlib.rarithmetic import ovfcheck, widen
+from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rtyper.annlowlevel import llstr
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -1013,7 +1013,7 @@
             elif mytype.typecode == 'c':
                 return space.newbytes(item)
             elif mytype.typecode == 'u':
-                code = ord(item)
+                code = r_uint(ord(item))
                 return space.newutf8(rutf8.unichr_as_utf8(code), 1)
             assert 0, "unreachable"
 
diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py
--- a/pypy/module/cpyext/longobject.py
+++ b/pypy/module/cpyext/longobject.py
@@ -4,6 +4,7 @@
     CONST_STRING, ADDR, CANNOT_FAIL)
 from pypy.objspace.std.longobject import W_LongObject
 from pypy.interpreter.error import OperationError
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
 from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask
 from rpython.rlib.rbigint import rbigint
 
@@ -191,7 +192,7 @@
     string, length gives the number of characters, and base is the radix
     for the conversion.  The radix must be in the range [2, 36]; if it is
     out of range, ValueError will be raised."""
-    w_value = space.newunicode(rffi.wcharpsize2unicode(u, length))
+    w_value = space.newutf8(wcharpsize2utf8(space, u, length), length)
     w_base = space.newint(rffi.cast(lltype.Signed, base))
     return space.call_function(space.w_long, w_value, w_base)
 
diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py
--- a/pypy/module/cpyext/object.py
+++ b/pypy/module/cpyext/object.py
@@ -246,7 +246,7 @@
     the Python expression unicode(o).  Called by the unicode() built-in
     function."""
     if w_obj is None:
-        return space.newunicode(u"<NULL>")
+        return space.newutf8("<NULL>", 6)
     return space.call_function(space.w_unicode, w_obj)
 
 @cpython_api([PyObject, PyObject], rffi.INT_real, error=-1)
@@ -302,7 +302,7 @@
         if opid == Py_EQ:
             return 1
         if opid == Py_NE:
-            return 0 
+            return 0
     w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int)
     return int(space.is_true(w_res))
 
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -3,7 +3,9 @@
 from rpython.tool.sourcetools import func_renamer
 
 from pypy.interpreter.error import OperationError, oefmt
-from pypy.interpreter.unicodehelper import wcharpsize2utf8
+from pypy.interpreter.unicodehelper import (
+    wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
+    unicode_encode_decimal)
 from pypy.module.unicodedata import unicodedb
 from pypy.module.cpyext.api import (
     CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -568,15 +570,11 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_16_helper(
-        string, size, errors,
-        True, # final ? false for multiple passes?
-        None, # errorhandler
-        byteorder)
+    result, _,  length, byteorder = str_decode_utf_16_helper(
+        string, errors, final=True, errorhandler=None, byteorder=byteorder)
     if pbyteorder is not None:
         pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
-    return space.newunicode(result)
+    return space.newutf8(result, length)
 
 @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject)
 def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder):
@@ -624,15 +622,11 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_32_helper(
-        string, size, errors,
-        True, # final ? false for multiple passes?
-        None, # errorhandler
-        byteorder)
+    result, _,  length, byteorder = str_decode_utf_32_helper(
+        string, errors, final=True, errorhandler=None, byteorder=byteorder)
     if pbyteorder is not None:
         pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
-    return space.newunicode(result)
+    return space.newutf8(result, length)
 
 @cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING],
              rffi.INT_real, error=-1)
@@ -650,14 +644,13 @@
 
     Returns 0 on success, -1 on failure.
     """
-    u = rffi.wcharpsize2unicode(s, length)
+    u = rffi.wcharpsize2utf8(s, length)
     if llerrors:
         errors = rffi.charp2str(llerrors)
     else:
         errors = None
     state = space.fromcache(CodecState)
-    result = runicode.unicode_encode_decimal(u, length, errors,
-                                             state.encode_error_handler)
+    result = unicode_encode_decimal(u, errors, state.encode_error_handler)
     i = len(result)
     output[i] = '\0'
     i -= 1
@@ -710,12 +703,17 @@
     """Return 1 if substr matches str[start:end] at the given tail end
     (direction == -1 means to do a prefix match, direction == 1 a
     suffix match), 0 otherwise. Return -1 if an error occurred."""
+    space.utf8_w(w_str)  # type check
+    space.utf8_w(w_substr)
     w_start = space.newint(start)
     w_end = space.newint(end)
     if rffi.cast(lltype.Signed, direction) <= 0:
-        return space.call_method(w_str, "startswith", w_substr, w_start, w_end)
+        w_result = space.call_method(
+            w_str, "startswith", w_substr, w_start, w_end)
     else:
-        return space.call_method(w_str, "endswith", w_substr, w_start, w_end)
+        w_result = space.call_method(
+            w_str, "endswith", w_substr, w_start, w_end)
+    return space.int_w(w_result)
 
 @cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1)
 def PyUnicode_Count(space, w_str, w_substr, start, end):
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -483,7 +483,7 @@
             except rutf8.CheckError:
                 from pypy.interpreter import unicodehelper
                 # get the correct error msg
-                unicodehelper.str_decode_utf8(s, len(s), 'string', True,
+                unicodehelper.str_decode_utf8(s, 'string', True,
                     unicodehelper.decode_error_handler(space))
                 assert False, "always raises"
         else:
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -3,7 +3,7 @@
 
 from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import specialize
-from rpython.rlib.rarithmetic import INT_MAX
+from rpython.rlib.rarithmetic import INT_MAX, r_uint
 from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.unroll import unrolling_iterable
@@ -330,7 +330,7 @@
             space = self.space
             if do_unicode:
                 cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
-                w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1)
+                w_s = space.newutf8(rutf8.unichr_as_utf8(r_uint(cp)), 1)
             else:
                 cp = ord(self.fmt[self.fmtpos - 1])
                 w_s = space.newbytes(chr(cp))
@@ -466,7 +466,7 @@
                 n = space.int_w(w_value)
                 if do_unicode:
                     try:
-                        c = rutf8.unichr_as_utf8(n)
+                        c = rutf8.unichr_as_utf8(r_uint(n))
                     except ValueError:
                         raise oefmt(space.w_OverflowError,
                                     "unicode character code out of range")
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to