[pypy-commit] pypy unicode-utf8: fix fix fix

fijal Thu, 02 Mar 2017 02:09:11 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r90460:8f690010d092
Date: 2017-03-01 19:16 +0100
http://bitbucket.org/pypy/pypy/changeset/8f690010d092/


Log:    fix fix fix

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -42,15 +42,6 @@
         """representation for debugging purposes"""
         return "%s(%r)" % (self.__class__.__name__, self._utf8)
 
-    def unwrap(self, space):
-        # for testing
-        return self._value
-
-    def create_if_subclassed(self):
-        if type(self) is W_UnicodeObject:
-            return self
-        return W_UnicodeObject(self._value)
-
     def is_w(self, space, w_other):
         if not isinstance(w_other, W_UnicodeObject):
             return False
@@ -103,6 +94,7 @@
     charbuf_w = str_w
 
     def listview_unicode(self):
+        XXX # fix at some point
         return _create_list_from_unicode(self._value)
 
     def ord(self, space):
@@ -130,6 +122,8 @@
         return rutf8.compute_length_utf8(self._utf8)
 
     def _val(self, space):
+        import pdb
+        pdb.set_trace()
         return self._utf8.decode('utf8')
 
     @staticmethod
@@ -534,11 +528,10 @@
 
     @unwrap_spec(maxsplit=int)
     def descr_split(self, space, w_sep=None, maxsplit=-1):
-        # XXX maybe optimize?
         res = []
         value = self._utf8
         if space.is_none(w_sep):
-            res = split(value, maxsplit=maxsplit)
+            res = split(value, maxsplit=maxsplit, isutf8=1)
             return space.newlist_from_unicode(res)
 
         by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
@@ -582,6 +575,12 @@
 
         return W_UnicodeObject(centered, self._len() + d)
 
+    def descr_contains(self, space, w_sub):
+        value = self._utf8
+        w_other = self.convert_arg_to_w_unicode(space, w_sub)
+        return space.newbool(value.find(w_other._utf8) >= 0)
+
+
 def wrapunicode(space, uni):
     return W_UnicodeObject(uni)
 
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -16,17 +16,31 @@
 
 # -------------- public API for string functions -----------------------
 
-@specialize.argtype(0)
-def _isspace(char):
-    if isinstance(char, str):
-        return char.isspace()
+@specialize.ll_and_arg(2)
+def _isspace(s, pos, isutf8=0):
+    if isutf8:
+        from rpython.rlib import rutf8
+        char = rutf8.codepoint_at_pos(s, pos)
+        return unicodedb.isspace(char)
     else:
-        assert isinstance(char, unicode)
-        return unicodedb.isspace(ord(char))
+        char = s[pos]
+        if isinstance(char, str):
+            return char.isspace()
+        else:
+            assert isinstance(char, unicode)
+            return unicodedb.isspace(ord(char))
 
+@specialize.arg(2)
+def _incr(s, pos, isutf8):
+    from rpython.rlib.rutf8 import next_codepoint_pos
 
-@specialize.argtype(0, 1)
-def split(value, by=None, maxsplit=-1):
+    if isutf8:
+        return next_codepoint_pos(s, pos)
+    else:
+        return pos + 1        
+
+@specialize.ll_and_arg(3)
+def split(value, by=None, maxsplit=-1, isutf8=0):
     if by is None:
         length = len(value)
         i = 0
@@ -34,9 +48,9 @@
         while True:
             # find the beginning of the next word
             while i < length:
-                if not _isspace(value[i]):
+                if not _isspace(value, i, isutf8):
                     break   # found
-                i += 1
+                i = _incr(value, i, isutf8)
             else:
                 break  # end of string, finished
 
@@ -44,16 +58,19 @@
             if maxsplit == 0:
                 j = length   # take all the rest of the string
             else:
-                j = i + 1
-                while j < length and not _isspace(value[j]):
-                    j += 1
+                j = _incr(value, i, isutf8)
+                while j < length and not _isspace(value, j, isutf8):
+                    j = _incr(value, j, isutf8)
                 maxsplit -= 1   # NB. if it's already < 0, it stays < 0
 
             # the word is value[i:j]
             res.append(value[i:j])
 
             # continue to look from the character following the space after the word
-            i = j + 1
+            if j < length:
+                i = _incr(value, j, isutf8)
+            else:
+                break
         return res
 
     if isinstance(value, unicode):
@@ -66,6 +83,8 @@
     bylen = len(by)
     if bylen == 0:
         raise ValueError("empty separator")
+    # XXX measure if preallocating the result list to the correct
+    #     size is faster, should be
 
     start = 0
     if bylen == 1:
@@ -102,8 +121,8 @@
     return res
 
 
-@specialize.argtype(0, 1)
-def rsplit(value, by=None, maxsplit=-1):
+@specialize.ll_and_arg(3)
+def rsplit(value, by=None, maxsplit=-1, isutf8=0):
     if by is None:
         res = []
 
diff --git a/rpython/rtyper/rpbc.py b/rpython/rtyper/rpbc.py
--- a/rpython/rtyper/rpbc.py
+++ b/rpython/rtyper/rpbc.py
@@ -1134,6 +1134,8 @@
         self.lowleveltype = self.r_im_self.lowleveltype
 
     def convert_const(self, method):
+        if method is None:
+            return nullptr(self.lowleveltype.TO)
         if getattr(method, 'im_func', None) is None:
             raise TyperError("not a bound method: %r" % method)
         return self.r_im_self.convert_const(method.im_self)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: fix fix fix

Reply via email to