Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: py3.6
Changeset: r96170:f2a689373046
Date: 2019-02-26 16:01 +0100
http://bitbucket.org/pypy/pypy/changeset/f2a689373046/

Log:    make performance of lower/upper/title/swapcase not terrible for
        strings containing &#931;

diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -395,73 +395,70 @@
     def descr_swapcase(self, space):
         value = self._utf8
         builder = rutf8.Utf8StringBuilder(len(value))
-        i = 0
-        for ch in rutf8.Utf8StringIterator(value):
+        for ch, pos in rutf8.Utf8StringPosIterator(value):
             if unicodedb.isupper(ch):
-                codes = self._lower_char(ch, value, i)
+                codes = self._lower_char(ch, value, pos)
             elif unicodedb.islower(ch):
                 codes = unicodedb.toupper_full(ch)
             else:
                 codes = [ch,]
             for c in codes:
                 builder.append_code(c)
-            i += 1
         return self.from_utf8builder(builder)
 
     def descr_title(self, space):
         if len(self._utf8) == 0:
             return self
-        return self.title_unicode(self._utf8)
+        return self.title_unicode()
 
     @jit.elidable
-    def title_unicode(self, value):
-        input = self._utf8
-        builder = rutf8.Utf8StringBuilder(len(input))
+    def title_unicode(self):
+        value = self._utf8
+        builder = rutf8.Utf8StringBuilder(len(value))
         previous_is_cased = False
-        i = 0
-        for ch in rutf8.Utf8StringIterator(input):
+        for ch, pos in rutf8.Utf8StringPosIterator(value):
             if previous_is_cased:
-                codes = self._lower_char(ch, value, i)
+                codes = self._lower_char(ch, value, pos)
             else:
                 codes = unicodedb.totitle_full(ch)
             for c in codes:
                 builder.append_code(c)
             previous_is_cased = unicodedb.iscased(ch)
-            i += 1
         return self.from_utf8builder(builder)
 
-    def _lower_char(self, ch, value, i):
+    def _lower_char(self, ch, value, bytepos):
         if ch == 0x3a3:
-            return [self._handle_capital_sigma(value, i), ]
+            return [self._handle_capital_sigma(value, bytepos), ]
         else:
             return unicodedb.tolower_full(ch)
 
-    def _handle_capital_sigma(self, value, i):
+    def _handle_capital_sigma(self, value, bytepos):
         # U+03A3 is in the Final_Sigma context when, it is found like this:
         #\p{cased} \p{case-ignorable}* U+03A3 not(\p{case-ignorable}* 
\p{cased})
         # where \p{xxx} is a character with property xxx.
 
-        # TODO: find a better way for utf8 -> codepoints
-        value = [ch for ch in rutf8.Utf8StringIterator(value)]
-        j = i - 1
         final_sigma = False
-        while j >= 0:
-            ch = value[j]
-            if unicodedb.iscaseignorable(ch):
-                j -= 1
-                continue
-            final_sigma = unicodedb.iscased(ch)
-            break
-        if final_sigma:
-            j = i + 1
-            length = len(value)
-            while j < length:
-                ch = value[j]
+        if bytepos > 0:
+            j = rutf8.prev_codepoint_pos(value, bytepos)
+            while j >= 0:
+                ch = rutf8.codepoint_at_pos(value, j)
                 if unicodedb.iscaseignorable(ch):
-                    j += 1
+                    if j == 0:
+                        break
+                    j = rutf8.prev_codepoint_pos(value, j)
                     continue
-                final_sigma = not unicodedb.iscased(ch)
+                final_sigma = unicodedb.iscased(ch)
                 break
+            if final_sigma and bytepos < len(value):
+                j = rutf8.next_codepoint_pos(value, bytepos)
+                length = len(value)
+                while j < length:
+                    ch = rutf8.codepoint_at_pos(value, j)
+                    if unicodedb.iscaseignorable(ch):
+                        j = rutf8.next_codepoint_pos(value, j)
+                        continue
+                    final_sigma = not unicodedb.iscased(ch)
+                    break
         if final_sigma:
             return 0x3C2
         else:
@@ -597,12 +594,10 @@
     def descr_lower(self, space):
         value = self._utf8
         builder = rutf8.Utf8StringBuilder(len(value))
-        i = 0
-        for ch in rutf8.Utf8StringIterator(value):
-            codes = self._lower_char(ch, value, i)
+        for ch, pos in rutf8.Utf8StringPosIterator(value):
+            codes = self._lower_char(ch, value, pos)
             for c in codes:
                 builder.append_code(c)
-            i += 1
         return self.from_utf8builder(builder)
 
     def descr_isdecimal(self, space):
@@ -879,18 +874,16 @@
 
         value = self._utf8
         builder = rutf8.Utf8StringBuilder(len(value))
-        it = rutf8.Utf8StringIterator(value)
-        uchar = it.next()
+        it = rutf8.Utf8StringPosIterator(value)
+        uchar, _ = it.next()
         codes = unicodedb.toupper_full(uchar)
         # can sometimes give more than one, like for omega-with-Ypogegrammeni, 
8179
         for c in codes:
             builder.append_code(c)
-        i = 1
-        for ch in it:
-            codes = self._lower_char(ch, value, i)
+        for ch, pos in it:
+            codes = self._lower_char(ch, value, pos)
             for c in codes:
                 builder.append_code(c)
-            i += 1
         return self.from_utf8builder(builder)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(u' '))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to