Author: Jeremy Thurgood <[email protected]>
Branch: unicode-utf8
Changeset: r92631:842f2cbd6d78
Date: 2017-10-07 14:54 +0200
http://bitbucket.org/pypy/pypy/changeset/842f2cbd6d78/
Log: expandtabs and [is]title.
diff --git a/pypy/objspace/std/test/test_unicodeobject.py
b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -230,6 +230,7 @@
assert u"bROWN fOX".title() == u"Brown Fox"
assert u"Brown Fox".title() == u"Brown Fox"
assert u"bro!wn fox".title() == u"Bro!Wn Fox"
+ assert u"brow\u4321n fox".title() == u"Brow\u4321N Fox"
def test_istitle(self):
assert u"".istitle() == False
diff --git a/pypy/objspace/std/unicodeobject.py
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -5,6 +5,7 @@
enforceargs, newlist_hint, specialize, we_are_translated)
from rpython.rlib.buffer import StringBuffer
from rpython.rlib.mutbuffer import MutableStringBuffer
+from rpython.rlib.rarithmetic import ovfcheck
from rpython.rlib.rstring import StringBuilder, split, rsplit, UnicodeBuilder,\
replace_count
from rpython.rlib.runicode import make_unicode_escape_function
@@ -349,6 +350,28 @@
def descr_rmod(self, space, w_values):
return mod_format(space, w_values, self, do_unicode=True)
+ def descr_title(self, space):
+ if len(self._utf8) == 0:
+ return self
+ return W_UnicodeObject(self.title(self._utf8), self._len())
+
+ @jit.elidable
+ def title(self, value):
+ input = self._utf8
+ builder = StringBuilder(len(input))
+ i = 0
+ previous_is_cased = False
+ while i < len(input):
+ ch = rutf8.codepoint_at_pos(input, i)
+ i = rutf8.next_codepoint_pos(input, i)
+ if not previous_is_cased:
+ ch = unicodedb.totitle(ch)
+ else:
+ ch = unicodedb.tolower(ch)
+ rutf8.unichr_as_utf8_append(builder, ch)
+ previous_is_cased = unicodedb.iscased(ch)
+ return builder.build()
+
def descr_translate(self, space, w_table):
input = self._utf8
result = StringBuilder(len(input))
@@ -389,6 +412,30 @@
w_errors)
return encode_object(space, self, encoding, errors)
+ @unwrap_spec(tabsize=int)
+ def descr_expandtabs(self, space, tabsize=8):
+ value = self._utf8
+ if not value:
+ return self._empty()
+
+ splitted = value.split('\t')
+
+ try:
+ if tabsize > 0:
+ ovfcheck(len(splitted) * tabsize)
+ except OverflowError:
+ raise oefmt(space.w_OverflowError, "new string is too long")
+ expanded = oldtoken = splitted.pop(0)
+ newlen = self._len() - len(splitted)
+
+ for token in splitted:
+ dist = self._tabindent(oldtoken, tabsize)
+ expanded += ' ' * dist + token
+ newlen += dist
+ oldtoken = token
+
+ return W_UnicodeObject(expanded, newlen)
+
_StringMethods_descr_join = descr_join
def descr_join(self, space, w_list):
l = space.listview_unicode(w_list)
@@ -438,6 +485,27 @@
i = rutf8.next_codepoint_pos(val, i)
return space.newbool(cased)
+ def descr_istitle(self, space):
+ cased = False
+ previous_is_cased = False
+ val = self._utf8
+ i = 0
+ while i < len(val):
+ uchar = rutf8.codepoint_at_pos(val, i)
+ i = rutf8.next_codepoint_pos(val, i)
+ if unicodedb.isupper(uchar) or unicodedb.istitle(uchar):
+ if previous_is_cased:
+ return space.w_False
+ previous_is_cased = True
+ cased = True
+ elif unicodedb.islower(uchar):
+ if not previous_is_cased:
+ return space.w_False
+ cased = True
+ else:
+ previous_is_cased = False
+ return space.newbool(cased)
+
def descr_isupper(self, space):
cased = False
i = 0
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit