[pypy-commit] pypy unicode-utf8: replace a lot of uses of StringBuilder by Utf8StringBuilder

cfbolz Fri, 24 Nov 2017 07:14:11 -0800

Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: unicode-utf8
Changeset: r93165:48da1a44d860
Date: 2017-11-24 16:12 +0100
http://bitbucket.org/pypy/pypy/changeset/48da1a44d860/


Log:    replace a lot of uses of StringBuilder by Utf8StringBuilder

diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -64,6 +64,11 @@
         # - malloced object, which means it has index, then
         #   _index_storage.flags determines the kind
 
+    @staticmethod
+    def from_utf8builder(builder):
+        return W_UnicodeObject(
+            builder.build(), builder.get_length(), builder.get_flag())
+
     def __repr__(self):
         """representation for debugging purposes"""
         return "%s(%r)" % (self.__class__.__name__, self._utf8)
@@ -344,57 +349,38 @@
         return mod_format(space, w_values, self, do_unicode=True)
 
     def descr_swapcase(self, space):
-        selfvalue = self._utf8
-        builder = StringBuilder(len(selfvalue))
-        flag = self._get_flag()
-        i = 0
-        while i < len(selfvalue):
-            ch = rutf8.codepoint_at_pos(selfvalue, i)
-            i = rutf8.next_codepoint_pos(selfvalue, i)
+        input = self._utf8
+        builder = rutf8.Utf8StringBuilder(len(input))
+        for ch in rutf8.Utf8StringIterator(input):
             if unicodedb.isupper(ch):
                 ch = unicodedb.tolower(ch)
             elif unicodedb.islower(ch):
                 ch = unicodedb.toupper(ch)
-            if ch >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-        return W_UnicodeObject(builder.build(), self._length, flag)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
 
     def descr_title(self, space):
         if len(self._utf8) == 0:
             return self
-        utf8, flag = self.title_unicode(self._utf8)
-        return W_UnicodeObject(utf8, self._len(), flag)
+        return self.title_unicode(self._utf8)
 
     @jit.elidable
     def title_unicode(self, value):
         input = self._utf8
-        builder = StringBuilder(len(input))
-        i = 0
+        builder = rutf8.Utf8StringBuilder(len(input))
         previous_is_cased = False
-        flag = self._get_flag()
-        while i < len(input):
-            ch = rutf8.codepoint_at_pos(input, i)
-            i = rutf8.next_codepoint_pos(input, i)
+        for ch in rutf8.Utf8StringIterator(input):
             if not previous_is_cased:
                 ch = unicodedb.totitle(ch)
             else:
                 ch = unicodedb.tolower(ch)
-            if ch >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
+            builder.append_code(ch)
             previous_is_cased = unicodedb.iscased(ch)
-        return builder.build(), flag
+        return self.from_utf8builder(builder)
 
     def descr_translate(self, space, w_table):
-        input = self._utf8
-        result = StringBuilder(len(input))
-        result_length = 0
-        flag = self._get_flag()
-        i = 0
-        while i < len(input):
-            codepoint = rutf8.codepoint_at_pos(input, i)
-            i = rutf8.next_codepoint_pos(input, i)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        for codepoint in rutf8.Utf8StringIterator(self._utf8):
             try:
                 w_newval = space.getitem(w_table, space.newint(codepoint))
             except OperationError as e:
@@ -406,24 +392,19 @@
                 elif space.isinstance_w(w_newval, space.w_int):
                     codepoint = space.int_w(w_newval)
                 elif isinstance(w_newval, W_UnicodeObject):
-                    result.append(w_newval._utf8)
-                    flag = rutf8.combine_flags(flag, w_newval._get_flag())
-                    result_length += w_newval._length
+                    builder.append_utf8(
+                        w_newval._utf8, w_newval._length, w_newval._get_flag())
                     continue
                 else:
                     raise oefmt(space.w_TypeError,
                                 "character mapping must return integer, None "
                                 "or unicode")
             try:
-                if codepoint >= 0x80:
-                    flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-                rutf8.unichr_as_utf8_append(result, codepoint,
-                                            allow_surrogates=True)
-                result_length += 1
+                builder.append_code(codepoint)
             except ValueError:
                 raise oefmt(space.w_TypeError,
                             "character mapping must be in range(0x110000)")
-        return W_UnicodeObject(result.build(), result_length, flag)
+        return self.from_utf8builder(builder)
 
     def descr_find(self, space, w_sub, w_start=None, w_end=None):
         w_result = self._unwrap_and_search(space, w_sub, w_start, w_end)
@@ -534,16 +515,11 @@
         return tformat.formatter_field_name_split()
 
     def descr_lower(self, space):
-        builder = StringBuilder(len(self._utf8))
-        pos = 0
-        flag = self._get_flag()
-        while pos < len(self._utf8):
-            lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
-            if lower >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            rutf8.unichr_as_utf8_append(builder, lower, allow_surrogates=True)
-            pos = rutf8.next_codepoint_pos(self._utf8, pos)
-        return W_UnicodeObject(builder.build(), self._len(), flag)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        for ch in rutf8.Utf8StringIterator(self._utf8):
+            lower = unicodedb.tolower(ch)
+            builder.append_code(lower)
+        return self.from_utf8builder(builder)
 
     def descr_isdecimal(self, space):
         return self._is_generic(space, '_isdecimal')
@@ -711,18 +687,11 @@
         return space.newlist(strs_w)
 
     def descr_upper(self, space):
-        value = self._utf8
-        builder = StringBuilder(len(value))
-        flag = self._get_flag()
-        i = 0
-        while i < len(value):
-            uchar = rutf8.codepoint_at_pos(value, i)
-            uchar = unicodedb.toupper(uchar)
-            if uchar >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            i = rutf8.next_codepoint_pos(value, i)
-            rutf8.unichr_as_utf8_append(builder, uchar, allow_surrogates=True)
-        return W_UnicodeObject(builder.build(), self._length, flag)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        for ch in rutf8.Utf8StringIterator(self._utf8):
+            ch = unicodedb.toupper(ch)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
 
     @unwrap_spec(width=int)
     def descr_zfill(self, space, width):
@@ -826,22 +795,15 @@
         if len(value) == 0:
             return self._empty()
 
-        flag = self._get_flag()
-        builder = StringBuilder(len(value))
-        uchar = rutf8.codepoint_at_pos(value, 0)
-        i = rutf8.next_codepoint_pos(value, 0)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        it = rutf8.Utf8StringIterator(self._utf8)
+        uchar = it.next()
         ch = unicodedb.toupper(uchar)
-        rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-        if ch >= 0x80:
-            flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-        while i < len(value):
-            uchar = rutf8.codepoint_at_pos(value, i)
-            i = rutf8.next_codepoint_pos(value, i)
-            ch = unicodedb.tolower(uchar)
-            rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-            if ch >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-        return W_UnicodeObject(builder.build(), self._len(), flag)
+        builder.append_code(ch)
+        for ch in it:
+            ch = unicodedb.tolower(ch)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_center(self, space, width, w_fillchar):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: replace a lot of uses of StringBuilder by Utf8StringBuilder

Reply via email to