Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8-test
Changeset: r93324:e6db8eec731a
Date: 2017-12-09 02:46 +0000
http://bitbucket.org/pypy/pypy/changeset/e6db8eec731a/
Log: hg merge unicode-utf8
diff --git a/pypy/interpreter/test/test_unicodehelper.py
b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,4 @@
+import pytest
from hypothesis import given, strategies
from rpython.rlib import rutf8
@@ -5,6 +6,7 @@
from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
def decode_utf8(u):
return str_decode_utf8(u, True, "strict", None)
@@ -68,3 +70,16 @@
def test_unicode_escape(u):
r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+ assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+ with pytest.raises(ValueError):
+ uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+ state = space.fromcache(CodecState)
+ handler = state.encode_error_handler
+ assert uh.unicode_encode_decimal(
+ u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+ result = uh.unicode_encode_decimal(
+ u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+ assert result == '12&#4660;'
diff --git a/pypy/interpreter/unicodehelper.py
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -7,6 +7,7 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rtyper.lltypesystem import rffi
from pypy.module._codecs import interp_codecs
+from pypy.module.unicodedata import unicodedb
@specialize.memo()
def decode_error_handler(space):
@@ -35,6 +36,16 @@
space.newtext(msg)]))
return raise_unicode_exception_encode
+def default_error_encode(
+ errors, encoding, msg, u, startingpos, endingpos):
+ """A default handler, for tests"""
+ assert endingpos >= 0
+ if errors == 'replace':
+ return '?', endingpos
+ if errors == 'ignore':
+ return '', endingpos
+ raise ValueError
+
def convert_arg_to_w_unicode(space, w_arg, strict=None):
return space.convert_arg_to_w_unicode(w_arg)
@@ -1458,3 +1469,70 @@
pos = rutf8.next_codepoint_pos(s, pos)
return result.build()
+# ____________________________________________________________
+# Decimal Encoder
+def unicode_encode_decimal(s, errors, errorhandler=None):
+ """Converts whitespace to ' ', decimal characters to their
+ corresponding ASCII digit and all other Latin-1 characters except
+ \0 as-is. Characters outside this range (Unicode ordinals 1-256)
+ are treated as errors. This includes embedded NULL bytes.
+ """
+ if errorhandler is None:
+ errorhandler = default_error_encode
+ result = StringBuilder(len(s))
+ pos = 0
+ i = 0
+ it = rutf8.Utf8StringIterator(s)
+ for ch in it:
+ if unicodedb.isspace(ch):
+ result.append(' ')
+ i += 1
+ continue
+ try:
+ decimal = unicodedb.decimal(ch)
+ except KeyError:
+ pass
+ else:
+ result.append(chr(48 + decimal))
+ i += 1
+ continue
+ if 0 < ch < 256:
+ result.append(chr(ch))
+ i += 1
+ continue
+ # All other characters are considered unencodable
+ start_index = i
+ i += 1
+ while not it.done():
+ ch = rutf8.codepoint_at_pos(s, it.get_pos())
+ try:
+ if (0 < ch < 256 or unicodedb.isspace(ch) or
+ unicodedb.decimal(ch) >= 0):
+ break
+ except KeyError:
+ # not a decimal
+ pass
+ if it.done():
+ break
+ ch = next(it)
+ i += 1
+ end_index = i
+ msg = "invalid decimal Unicode string"
+ r, pos = errorhandler(
+ errors, 'decimal', msg, s, start_index, end_index)
+ for ch in rutf8.Utf8StringIterator(r):
+ if unicodedb.isspace(ch):
+ result.append(' ')
+ continue
+ try:
+ decimal = unicodedb.decimal(ch)
+ except KeyError:
+ pass
+ else:
+ result.append(chr(48 + decimal))
+ continue
+ if 0 < ch < 256:
+ result.append(chr(ch))
+ continue
+ errorhandler('strict', 'decimal', msg, s, start_index, end_index)
+ return result.build()
diff --git a/pypy/module/_pypyjson/interp_decoder.py
b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -3,6 +3,7 @@
from rpython.rlib.objectmodel import specialize, always_inline, r_dict
from rpython.rlib import rfloat, runicode, rutf8
from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rarithmetic import r_uint
from pypy.interpreter.error import oefmt
from pypy.interpreter import unicodehelper
@@ -366,7 +367,7 @@
return # help the annotator to know that we'll never go beyond
# this point
#
- utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True)
+ utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True)
builder.append(utf8_ch)
return i
@@ -400,7 +401,7 @@
break
elif ch == '\\' or ch < '\x20':
self.pos = i-1
- return self.space.unicode_w(self.decode_string_escaped(start))
+ return self.decode_string_escaped(start)
strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i]))
bits |= ord(ch)
length = i - start - 1
diff --git a/pypy/module/_rawffi/alt/type_converter.py
b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -128,7 +128,7 @@
intval: lltype.Signed
"""
self.error(w_ffitype, w_obj)
-
+
def handle_unichar(self, w_ffitype, w_obj, intval):
"""
intval: lltype.Signed
@@ -174,7 +174,7 @@
def handle_struct_rawffi(self, w_ffitype, w_structinstance):
"""
This method should be killed as soon as we remove support for _rawffi
structures
-
+
w_structinstance: W_StructureInstance
"""
self.error(w_ffitype, w_structinstance)
@@ -228,7 +228,7 @@
return space.newbytes(chr(ucharval))
elif w_ffitype.is_unichar():
wcharval = self.get_unichar(w_ffitype)
- return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
+ return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1)
elif w_ffitype.is_double():
return self._float(w_ffitype)
elif w_ffitype.is_singlefloat():
@@ -349,7 +349,7 @@
def get_struct_rawffi(self, w_ffitype, w_structdescr):
"""
This should be killed as soon as we kill support for _rawffi structures
-
+
Return type: lltype.Unsigned
(the address of the structure)
"""
diff --git a/pypy/module/_rawffi/interp_rawffi.py
b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -596,9 +596,9 @@
return space.w_None
wcharp_addr = rffi.cast(rffi.CWCHARP, address)
if maxlength == -1:
- s = rffi.wcharp2utf8(wcharp_addr)
+ s = rffi.wcharp2unicode(wcharp_addr)
else:
- s = rffi.wcharpsize2utf8(wcharp_addr, maxlength)
+ s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
return space.newunicode(s)
@unwrap_spec(address=r_uint, maxlength=int)
diff --git a/pypy/module/array/interp_array.py
b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1,7 +1,7 @@
from rpython.rlib import jit, rgc, rutf8
from rpython.rlib.buffer import RawBuffer
from rpython.rlib.objectmodel import keepalive_until_here
-from rpython.rlib.rarithmetic import ovfcheck, widen
+from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint
from rpython.rlib.unroll import unrolling_iterable
from rpython.rtyper.annlowlevel import llstr
from rpython.rtyper.lltypesystem import lltype, rffi
@@ -1013,7 +1013,7 @@
elif mytype.typecode == 'c':
return space.newbytes(item)
elif mytype.typecode == 'u':
- code = ord(item)
+ code = r_uint(ord(item))
return space.newutf8(rutf8.unichr_as_utf8(code), 1)
assert 0, "unreachable"
diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py
--- a/pypy/module/cpyext/longobject.py
+++ b/pypy/module/cpyext/longobject.py
@@ -4,6 +4,7 @@
CONST_STRING, ADDR, CANNOT_FAIL)
from pypy.objspace.std.longobject import W_LongObject
from pypy.interpreter.error import OperationError
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask
from rpython.rlib.rbigint import rbigint
@@ -191,7 +192,7 @@
string, length gives the number of characters, and base is the radix
for the conversion. The radix must be in the range [2, 36]; if it is
out of range, ValueError will be raised."""
- w_value = space.newunicode(rffi.wcharpsize2unicode(u, length))
+ w_value = space.newutf8(wcharpsize2utf8(space, u, length), length)
w_base = space.newint(rffi.cast(lltype.Signed, base))
return space.call_function(space.w_long, w_value, w_base)
diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py
--- a/pypy/module/cpyext/object.py
+++ b/pypy/module/cpyext/object.py
@@ -246,7 +246,7 @@
the Python expression unicode(o). Called by the unicode() built-in
function."""
if w_obj is None:
- return space.newunicode(u"<NULL>")
+ return space.newutf8("<NULL>", 6)
return space.call_function(space.w_unicode, w_obj)
@cpython_api([PyObject, PyObject], rffi.INT_real, error=-1)
@@ -302,7 +302,7 @@
if opid == Py_EQ:
return 1
if opid == Py_NE:
- return 0
+ return 0
w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int)
return int(space.is_true(w_res))
diff --git a/pypy/module/cpyext/unicodeobject.py
b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -3,7 +3,9 @@
from rpython.tool.sourcetools import func_renamer
from pypy.interpreter.error import OperationError, oefmt
-from pypy.interpreter.unicodehelper import wcharpsize2utf8
+from pypy.interpreter.unicodehelper import (
+ wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
+ unicode_encode_decimal)
from pypy.module.unicodedata import unicodedb
from pypy.module.cpyext.api import (
CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -568,15 +570,11 @@
else:
errors = None
- result, length, byteorder = runicode.str_decode_utf_16_helper(
- string, size, errors,
- True, # final ? false for multiple passes?
- None, # errorhandler
- byteorder)
+ result, _, length, byteorder = str_decode_utf_16_helper(
+ string, errors, final=True, errorhandler=None, byteorder=byteorder)
if pbyteorder is not None:
pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
- return space.newunicode(result)
+ return space.newutf8(result, length)
@cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject)
def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder):
@@ -624,15 +622,11 @@
else:
errors = None
- result, length, byteorder = runicode.str_decode_utf_32_helper(
- string, size, errors,
- True, # final ? false for multiple passes?
- None, # errorhandler
- byteorder)
+ result, _, length, byteorder = str_decode_utf_32_helper(
+ string, errors, final=True, errorhandler=None, byteorder=byteorder)
if pbyteorder is not None:
pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
- return space.newunicode(result)
+ return space.newutf8(result, length)
@cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING],
rffi.INT_real, error=-1)
@@ -650,14 +644,13 @@
Returns 0 on success, -1 on failure.
"""
- u = rffi.wcharpsize2unicode(s, length)
+ u = rffi.wcharpsize2utf8(s, length)
if llerrors:
errors = rffi.charp2str(llerrors)
else:
errors = None
state = space.fromcache(CodecState)
- result = runicode.unicode_encode_decimal(u, length, errors,
- state.encode_error_handler)
+ result = unicode_encode_decimal(u, errors, state.encode_error_handler)
i = len(result)
output[i] = '\0'
i -= 1
@@ -710,12 +703,17 @@
"""Return 1 if substr matches str[start:end] at the given tail end
(direction == -1 means to do a prefix match, direction == 1 a
suffix match), 0 otherwise. Return -1 if an error occurred."""
+ space.utf8_w(w_str) # type check
+ space.utf8_w(w_substr)
w_start = space.newint(start)
w_end = space.newint(end)
if rffi.cast(lltype.Signed, direction) <= 0:
- return space.call_method(w_str, "startswith", w_substr, w_start, w_end)
+ w_result = space.call_method(
+ w_str, "startswith", w_substr, w_start, w_end)
else:
- return space.call_method(w_str, "endswith", w_substr, w_start, w_end)
+ w_result = space.call_method(
+ w_str, "endswith", w_substr, w_start, w_end)
+ return space.int_w(w_result)
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t,
error=-1)
def PyUnicode_Count(space, w_str, w_substr, start, end):
diff --git a/pypy/module/pyexpat/interp_pyexpat.py
b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -483,7 +483,7 @@
except rutf8.CheckError:
from pypy.interpreter import unicodehelper
# get the correct error msg
- unicodehelper.str_decode_utf8(s, len(s), 'string', True,
+ unicodehelper.str_decode_utf8(s, 'string', True,
unicodehelper.decode_error_handler(space))
assert False, "always raises"
else:
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -3,7 +3,7 @@
from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import specialize
-from rpython.rlib.rarithmetic import INT_MAX
+from rpython.rlib.rarithmetic import INT_MAX, r_uint
from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.unroll import unrolling_iterable
@@ -330,7 +330,7 @@
space = self.space
if do_unicode:
cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
- w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1)
+ w_s = space.newutf8(rutf8.unichr_as_utf8(r_uint(cp)), 1)
else:
cp = ord(self.fmt[self.fmtpos - 1])
w_s = space.newbytes(chr(cp))
@@ -466,7 +466,7 @@
n = space.int_w(w_value)
if do_unicode:
try:
- c = rutf8.unichr_as_utf8(n)
+ c = rutf8.unichr_as_utf8(r_uint(n))
except ValueError:
raise oefmt(space.w_OverflowError,
"unicode character code out of range")
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit