Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r92751:d9fe594a639b
Date: 2017-10-14 07:34 +0200
http://bitbucket.org/pypy/pypy/changeset/d9fe594a639b/
Log: Force 'allow_surrogates' to be specified in check_utf8()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -39,7 +39,8 @@
w_input = space.newbytes(input)
else:
w_cls = space.w_UnicodeEncodeError
- w_input = space.newutf8(input, rutf8.check_utf8(input))
+ length = rutf8.check_utf8(input, allow_surrogates=True)
+ w_input = space.newutf8(input, length)
w_exc = space.call_function(
w_cls,
space.newtext(encoding),
@@ -447,7 +448,8 @@
# "allow_surrogates=True"
@unwrap_spec(utf8='utf8', errors='text_or_none')
def utf_8_encode(space, utf8, errors="strict"):
- return space.newtuple([space.newbytes(utf8), space.newint(rutf8.check_utf8(utf8))])
+ length = rutf8.check_utf8(utf8, allow_surrogates=True)
+ return space.newtuple([space.newbytes(utf8), space.newint(length)])
#@unwrap_spec(uni=unicode, errors='text_or_none')
#def utf_8_encode(space, uni, errors="strict"):
# if errors is None:
@@ -472,7 +474,7 @@
state = space.fromcache(CodecState)
# call the fast version for checking
try:
- lgt = rutf8.check_utf8(string)
+ lgt = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError as e:
# XXX do the way around runicode - we can optimize it later if we
# decide we care about obscure cases
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -317,7 +317,8 @@
for utf in lst:
assert utf is not None
assert isinstance(utf, str)
- res_w.append(self.newutf8(utf, rutf8.check_utf8(utf)))
+ length = rutf8.check_utf8(utf, allow_surrogates=True)
+ res_w.append(self.newutf8(utf, length))
return self.newlist(res_w)
def newlist_int(self, list_i):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -435,7 +435,8 @@
if res_index == -1:
return space.newint(-1)
- res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+ res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+ force_len=res_index) # can't raise
return space.newint(res)
def descr_rfind(self, space, w_sub, w_start=None, w_end=None):
@@ -447,7 +448,8 @@
if res_index == -1:
return space.newint(-1)
- res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+ res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+ force_len=res_index) # can't raise
return space.newint(res)
def descr_index(self, space, w_sub, w_start=None, w_end=None):
@@ -460,7 +462,8 @@
raise oefmt(space.w_ValueError,
"substring not found in string.index")
- res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+ res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+ force_len=res_index) # can't raise
return space.newint(res)
def descr_rindex(self, space, w_sub, w_start=None, w_end=None):
@@ -473,7 +476,8 @@
raise oefmt(space.w_ValueError,
"substring not found in string.rindex")
- res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+ res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+ force_len=res_index) # can't raise
return space.newint(res)
@specialize.arg(2)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -290,7 +290,7 @@
#@jit.elidable
-def check_utf8(s, allow_surrogates=False, force_len=-1):
+def check_utf8(s, allow_surrogates, force_len=-1):
"""Check that 's' is a utf-8-encoded byte string.
Returns the length (number of chars) or raise CheckError.
Note that surrogates are not handled specially here.
@@ -424,8 +424,8 @@
def codepoint_position_at_index(utf8, storage, index):
""" Return byte index of a character inside utf8 encoded string, given
storage of type UTF8_INDEX_STORAGE. The index must be smaller than
- the utf8 length: if needed, check explicitly before calling this
- function.
+ or equal to the utf8 length: if needed, check explicitly before calling
+ this function.
"""
current = index >> 6
ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit