Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r92751:d9fe594a639b
Date: 2017-10-14 07:34 +0200
http://bitbucket.org/pypy/pypy/changeset/d9fe594a639b/

Log:    Force 'allow_surrogates' to be specified in check_utf8()

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -39,7 +39,8 @@
                 w_input = space.newbytes(input)
             else:
                 w_cls = space.w_UnicodeEncodeError
-                w_input = space.newutf8(input, rutf8.check_utf8(input))
+                length = rutf8.check_utf8(input, allow_surrogates=True)
+                w_input = space.newutf8(input, length)
             w_exc =  space.call_function(
                 w_cls,
                 space.newtext(encoding),
@@ -447,7 +448,8 @@
 # "allow_surrogates=True"
 @unwrap_spec(utf8='utf8', errors='text_or_none')
 def utf_8_encode(space, utf8, errors="strict"):
-    return space.newtuple([space.newbytes(utf8), space.newint(rutf8.check_utf8(utf8))])
+    length = rutf8.check_utf8(utf8, allow_surrogates=True)
+    return space.newtuple([space.newbytes(utf8), space.newint(length)])
 #@unwrap_spec(uni=unicode, errors='text_or_none')
 #def utf_8_encode(space, uni, errors="strict"):
 #    if errors is None:
@@ -472,7 +474,7 @@
     state = space.fromcache(CodecState)
     # call the fast version for checking
     try:
-        lgt = rutf8.check_utf8(string)
+        lgt = rutf8.check_utf8(string, allow_surrogates=True)
     except rutf8.CheckError as e:
         # XXX do the way around runicode - we can optimize it later if we
         # decide we care about obscure cases
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -317,7 +317,8 @@
         for utf in lst:
             assert utf is not None
             assert isinstance(utf, str)
-            res_w.append(self.newutf8(utf, rutf8.check_utf8(utf)))
+            length = rutf8.check_utf8(utf, allow_surrogates=True)
+            res_w.append(self.newutf8(utf, length))
         return self.newlist(res_w)
 
     def newlist_int(self, list_i):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -435,7 +435,8 @@
         if res_index == -1:
             return space.newint(-1)
 
-        res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+        res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+                               force_len=res_index) # can't raise
         return space.newint(res)
 
     def descr_rfind(self, space, w_sub, w_start=None, w_end=None):
@@ -447,7 +448,8 @@
         if res_index == -1:
             return space.newint(-1)
 
-        res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+        res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+                               force_len=res_index) # can't raise
         return space.newint(res)
 
     def descr_index(self, space, w_sub, w_start=None, w_end=None):
@@ -460,7 +462,8 @@
             raise oefmt(space.w_ValueError,
                         "substring not found in string.index")
 
-        res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+        res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+                               force_len=res_index) # can't raise
         return space.newint(res)
 
     def descr_rindex(self, space, w_sub, w_start=None, w_end=None):
@@ -473,7 +476,8 @@
             raise oefmt(space.w_ValueError,
                         "substring not found in string.rindex")
 
-        res = rutf8.check_utf8(self._utf8, force_len=res_index) # can't raise
+        res = rutf8.check_utf8(self._utf8, allow_surrogates=True,
+                               force_len=res_index) # can't raise
         return space.newint(res)
 
     @specialize.arg(2)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -290,7 +290,7 @@
 
 
 #@jit.elidable
-def check_utf8(s, allow_surrogates=False, force_len=-1):
+def check_utf8(s, allow_surrogates, force_len=-1):
     """Check that 's' is a utf-8-encoded byte string.
     Returns the length (number of chars) or raise CheckError.
     Note that surrogates are not handled specially here.
@@ -424,8 +424,8 @@
 def codepoint_position_at_index(utf8, storage, index):
     """ Return byte index of a character inside utf8 encoded string, given
     storage of type UTF8_INDEX_STORAGE.  The index must be smaller than
-    the utf8 length: if needed, check explicitly before calling this
-    function.
+    or equal to the utf8 length: if needed, check explicitly before calling
+    this function.
     """
     current = index >> 6
     ofs = ord(storage[current].ofs[(index >> 2) & 0x0F])
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to