[pypy-commit] pypy unicode-utf8: whack a few more places, handle surrogates correctly

fijal Thu, 07 Dec 2017 08:07:01 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r93298:8d468e08f3fe
Date: 2017-12-07 18:03 +0200
http://bitbucket.org/pypy/pypy/changeset/8d468e08f3fe/


Log:    whack a few more places, handle surrogates correctly

diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -330,8 +330,7 @@
             space = self.space
             if do_unicode:
                 cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
-                flag = rutf8.get_flag_from_code(cp)
-                w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag)
+                w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1)
             else:
                 cp = ord(self.fmt[self.fmtpos - 1])
                 w_s = space.newbytes(chr(cp))
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -170,7 +170,8 @@
     def _istitle(self, ch):
         return unicodedb.isupper(ch) or unicodedb.istitle(ch)
 
-    def _isspace(self, ch):
+    @staticmethod
+    def _isspace(ch):
         return unicodedb.isspace(ch)
 
     def _isalpha(self, ch):
@@ -188,8 +189,8 @@
     def _iscased(self, ch):
         return unicodedb.iscased(ch)
 
-    def _islinebreak(self, s, pos):
-        return rutf8.islinebreak(s, pos)
+    def _islinebreak(self, ch):
+        return unicodedb.islinebreak(ch)
 
     @staticmethod
     @unwrap_spec(w_string=WrappedDefault(""))
@@ -610,7 +611,7 @@
         while pos < length:
             sol = pos
             lgt = 0
-            while pos < length and not self._islinebreak(value, pos):
+            while pos < length and not self._islinebreak(rutf8.codepoint_at_pos(value, pos)):
                 pos = rutf8.next_codepoint_pos(value, pos)
                 lgt += 1
             eol = pos
@@ -792,7 +793,7 @@
         if pos < 0:
             return space.newtuple([self, self._empty(), self._empty()])
         else:
-            lgt, _ = rutf8.check_utf8(value, True, stop=pos)
+            lgt = rutf8.check_utf8(value, True, stop=pos)
             return space.newtuple(
                 [W_UnicodeObject(value[0:pos], lgt), w_sub,
                  W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
@@ -810,7 +811,7 @@
         if pos < 0:
             return space.newtuple([self._empty(), self._empty(), self])
         else:
-            lgt, _ = rutf8.check_utf8(value, True, stop=pos)
+            lgt = rutf8.check_utf8(value, True, stop=pos)
             return space.newtuple(
                 [W_UnicodeObject(value[0:pos], lgt), w_sub,
                  W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
@@ -1087,7 +1088,10 @@
             return space.newbytes(s)
         if ((encoding is None and space.sys.defaultencoding == 'utf8') or
              encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'):
-            return space.newbytes(space.utf8_w(w_object))
+            utf8 = space.utf8_w(w_object)
+            if rutf8.has_surrogates(utf8):
+                utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+            return space.newbytes(utf8)
     if w_encoder is None:
         from pypy.module._codecs.interp_codecs import lookup_codec
         w_encoder = space.getitem(lookup_codec(space, encoding), space.newint(0))
@@ -1728,14 +1732,12 @@
     result = ['\0'] * w_unistr._length
     digits = ['0', '1', '2', '3', '4',
               '5', '6', '7', '8', '9']
-    i = 0
     res_pos = 0
-    while i < len(unistr):
-        uchr = rutf8.codepoint_at_pos(unistr, i)
-        if rutf8.isspace(unistr, i):
+    iter = rutf8.Utf8StringIterator(unistr)
+    for uchr in iter:
+        if W_UnicodeObject._isspace(uchr):
             result[res_pos] = ' '
             res_pos += 1
-            i = rutf8.next_codepoint_pos(unistr, i)
             continue
         try:
             result[res_pos] = digits[unicodedb.decimal(uchr)]
@@ -1744,14 +1746,14 @@
                 result[res_pos] = chr(uchr)
             else:
                 w_encoding = space.newtext('decimal')
-                w_start = space.newint(i)
-                w_end = space.newint(i+1)
+                pos = iter.get_pos()
+                w_start = space.newint(pos)
+                w_end = space.newint(pos+1)
                 w_reason = space.newtext('invalid decimal Unicode string')
                 raise OperationError(space.w_UnicodeEncodeError,
                                      space.newtuple([w_encoding, w_unistr,
                                                      w_start, w_end,
                                                      w_reason]))
-        i = rutf8.next_codepoint_pos(unistr, i)
         res_pos += 1
     return ''.join(result)
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -410,6 +410,13 @@
     assert pos - continuation_bytes >= 0
     return pos - continuation_bytes
 
+def has_surrogates(utf8):
+    # XXX write a faster version maybe
+    for ch in Utf8StringIterator(utf8):
+        if 0xD800 <= ch <= 0xDBFF:
+            return True
+    return False
+
 def reencode_utf8_with_surrogates(utf8):
     """ Receiving valid UTF8 which contains surrogates, combine surrogate
     pairs into correct UTF8 with pairs collpased. This is a rare case
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: whack a few more places, handle surrogates correctly

Reply via email to