Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8-test
Changeset: r93344:1665df77270e
Date: 2017-12-10 05:27 +0000
http://bitbucket.org/pypy/pypy/changeset/1665df77270e/
Log: hg merge unicode-utf8
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -12,3 +12,4 @@
* improve performance of splitlines
* fix _pypyjson to not use a wrapped dict when decoding an object
+* make sure we review all the places that call ord(unichr) to check for ValueErrors
\ No newline at end of file
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1098,22 +1098,19 @@
elif ch >= 0xE000 or allow_surrogates:
_STORECHAR(result, ch, byteorder)
else:
- ru, newindex = errorhandler(errors, public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- for j in range(newindex - index):
- pos = rutf8.next_codepoint_pos(s, pos)
- j = 0
- while j < len(ru):
- ch = rutf8.codepoint_at_pos(ru, j)
- if ord(ch) < 0xD800:
- _STORECHAR(result, ord(ch), byteorder)
+ res_8, newindex = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ for cp in rutf8.Utf8StringIterator(res_8):
+ if cp < 0xD800:
+ _STORECHAR(result, cp, byteorder)
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- j = rutf8.next_codepoint_pos(ru, j)
- index = newindex
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
pos = rutf8.next_codepoint_pos(s, pos)
@@ -1282,22 +1279,19 @@
ch = rutf8.codepoint_at_pos(s, pos)
pos = rutf8.next_codepoint_pos(s, pos)
if not allow_surrogates and 0xD800 <= ch < 0xE000:
- ru, newindex = errorhandler(errors, public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- for j in range(newindex - index):
- pos = rutf8.next_codepoint_pos(s, pos)
- j = 0
- while j < len(ru):
- ch = rutf8.codepoint_at_pos(ru, j)
- if ord(ch) < 0xD800:
- _STORECHAR32(result, ord(ch), byteorder)
+ res_8, newindex = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ for ch in rutf8.Utf8StringIterator(res_8):
+ if ch < 0xD800:
+ _STORECHAR32(result, ch, byteorder)
else:
- errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- j = rutf8.next_codepoint_pos(ru, j)
- index = newindex
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
_STORECHAR32(result, ch, byteorder)
index += 1
@@ -1425,8 +1419,7 @@
lgt = rutf8.check_utf8(r, True)
return r, pos, lgt
-def utf8_encode_charmap(s, errors, errorhandler=None,
- mapping=None):
+def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
size = len(s)
if mapping is None:
return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
@@ -1438,31 +1431,29 @@
index = 0
while pos < size:
ch = rutf8.codepoint_at_pos(s, pos)
-
c = mapping.get(ch, '')
if len(c) == 0:
- # collect all unencodable chars. Important for narrow builds.
- collend = rutf8.next_codepoint_pos(s, pos)
- endindex = index + 1
- while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '':
- collend = rutf8.next_codepoint_pos(s, collend)
- endindex += 1
- rs, endindex = errorhandler(errors, "charmap",
+ # collect all unencodable chars.
+ startindex = index
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ while (pos < size and
+ mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ res_8, newindex = errorhandler(errors, "charmap",
"character maps to <undefined>",
- s, index, endindex)
- j = 0
- for _ in range(endindex - index):
- ch2 = rutf8.codepoint_at_pos(rs, j)
- ch2 = mapping.get(ch2, '')
+ s, startindex, index)
+ for cp2 in rutf8.Utf8StringIterator(res_8):
+ ch2 = mapping.get(cp2, '')
if not ch2:
errorhandler(
- "strict", "charmap",
- "character maps to <undefined>",
- s, index, index + 1)
+ "strict", "charmap", "character maps to <undefined>",
+ s, startindex, index)
result.append(ch2)
- index += 1
- j = rutf8.next_codepoint_pos(rs, j)
- pos = rutf8.next_codepoint_pos(s, pos)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
result.append(c)
index += 1
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -537,8 +537,12 @@
assert '\xff'.decode('utf-7', 'ignore') == ''
assert '\x00'.decode('unicode-internal', 'ignore') == ''
- def test_backslahreplace(self):
- assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+ def test_backslashreplace(self):
+ sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+ expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+ assert sin.encode('ascii', 'backslashreplace') == expected
+ expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff"
+ assert sin.encode("iso-8859-15", "backslashreplace") == expected
def test_badhandler(self):
import codecs
diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py
--- a/pypy/module/_locale/interp_locale.py
+++ b/pypy/module/_locale/interp_locale.py
@@ -133,10 +133,11 @@
rffi.free_charp(s1_c)
rffi.free_charp(s2_c)
- s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2)
+ s1, l1 = space.utf8_len_w(w_s1)
+ s2, l2 = space.utf8_len_w(w_s2)
- s1_c = rffi.unicode2wcharp(s1)
- s2_c = rffi.unicode2wcharp(s2)
+ s1_c = rffi.utf82wcharp(s1, l1)
+ s2_c = rffi.utf82wcharp(s2, l2)
try:
result = _wcscoll(s1_c, s2_c)
finally:
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -227,8 +227,8 @@
ucharval = self.get_char(w_ffitype)
return space.newbytes(chr(ucharval))
elif w_ffitype.is_unichar():
- wcharval = self.get_unichar(w_ffitype)
- return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1)
+ wcharval = r_uint(self.get_unichar(w_ffitype))
+ return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
elif w_ffitype.is_double():
return self._float(w_ffitype)
elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -448,7 +448,8 @@
elif c == 'c':
return space.newbytes(func(add_arg, argdesc, ll_type))
elif c == 'u':
- return space.newunicode(func(add_arg, argdesc, ll_type))
+ return space.newutf8(rutf8.unichr_as_utf8(
+ ord(func(add_arg, argdesc, ll_type))), 1)
elif c == 'f' or c == 'd' or c == 'g':
return space.newfloat(float(func(add_arg, argdesc, ll_type)))
else:
@@ -596,10 +597,10 @@
return space.w_None
wcharp_addr = rffi.cast(rffi.CWCHARP, address)
if maxlength == -1:
- s = rffi.wcharp2unicode(wcharp_addr)
+ s, lgt = rffi.wcharp2utf8(wcharp_addr)
else:
- s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
- return space.newunicode(s)
+ s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength)
+ return space.newutf8(s, lgt)
@unwrap_spec(address=r_uint, maxlength=int)
def charp2rawstring(space, address, maxlength=-1):
@@ -612,8 +613,8 @@
def wcharp2rawunicode(space, address, maxlength=-1):
if maxlength == -1:
return wcharp2unicode(space, address)
- s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength)
- return space.newunicode(s)
+ s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength)
+ return space.newutf8(s, maxlength)
@unwrap_spec(address=r_uint, newcontent='bufferstr')
def rawstring2charp(space, address, newcontent):
diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py
--- a/rpython/annotator/unaryop.py
+++ b/rpython/annotator/unaryop.py
@@ -792,7 +792,7 @@
def ord(self):
# warning, on 32-bit with 32-bit unichars, this might return
# negative numbers
- return SomeInteger()
+ return SomeInteger(nonneg=True)
class __extend__(SomeIterator):
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -19,7 +19,7 @@
from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib import jit
+from rpython.rlib import jit, types
from rpython.rlib.signature import signature
from rpython.rlib.types import char, none
from rpython.rlib.rarithmetic import r_uint
@@ -27,6 +27,8 @@
from rpython.rtyper.lltypesystem import lltype, rffi
+# we need a way to accept both r_uint and int(nonneg=True)
+#@signature(types.int_nonneg(), types.bool(), returns=types.str())
def unichr_as_utf8(code, allow_surrogates=False):
"""Encode code (numeric value) as utf8 encoded string
"""
@@ -437,7 +439,7 @@
low = codepoint_at_pos(utf8, i)
if 0xDC00 <= low <= 0xDFFF:
uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
- i = next_codepoint_pos(utf8, i)
+ i = next_codepoint_pos(utf8, i)
# else not really a surrogate pair, just append high
else:
i = next_codepoint_pos(utf8, i)
@@ -535,6 +537,13 @@
else:
return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
+def _pos_at_index(utf8, index):
+ # Slow!
+ pos = 0
+ for _ in range(index):
+ pos = next_codepoint_pos(utf8, pos)
+ return pos
+
@jit.dont_look_inside
def codepoint_at_index(utf8, storage, index):
""" Return codepoint of a character inside utf8 encoded string, given
diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py
--- a/rpython/rlib/types.py
+++ b/rpython/rlib/types.py
@@ -26,6 +26,8 @@
def int():
return model.SomeInteger()
+def int_nonneg():
+ return model.SomeInteger(nonneg=True)
def bool():
return model.SomeBool()
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1019,7 +1019,27 @@
s = StringBuilder(size)
for i in range(size):
rutf8.unichr_as_utf8_append(s, ord(w[i]))
- return s.build()
+ return s.build()
+
+def wcharp2utf8(w):
+ from rpython.rlib import rutf8
+
+ s = rutf8.Utf8StringBuilder()
+ i = 0
+ while ord(w[i]):
+ s.append_code(ord(w[i]))
+ i += 1
+ return s.build(), i
+
+def wcharp2utf8n(w, maxlen):
+ from rpython.rlib import rutf8
+
+ s = rutf8.Utf8StringBuilder(maxlen)
+ i = 0
+ while i < maxlen and w[i]:
+ s.append_code(ord(w[i]))
+ i += 1
+ return s.build(), i
def utf82wcharp(utf8, utf8len):
from rpython.rlib import rutf8
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit