Author: fijal
Branch: unicode-utf8
Changeset: r90546:e0e41208baf4
Date: 2017-03-04 22:55 +0100
http://bitbucket.org/pypy/pypy/changeset/e0e41208baf4/
Log: implement isspace as a regex
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -194,7 +194,7 @@
return unicodedb.iscased(ord(ch))
def _islinebreak(self, s, pos):
- return rutf8.check_newline_utf8(s, pos)
+ return rutf8.islinebreak(s, pos)
def _upper(self, ch):
return unichr(unicodedb.toupper(ord(ch)))
@@ -668,6 +668,25 @@
return W_UnicodeObject(value, self._len())
+ def _strip_none(self, space, left, right):
+ "internal function called by str_xstrip methods"
+ value = self._utf8
+
+ lpos = 0
+ rpos = self._len()
+
+ if left:
+ while lpos < rpos and self._isspace(value[lpos]):
+ lpos += 1
+
+ if right:
+ while rpos > lpos and self._isspace(value[rpos - 1]):
+ rpos -= 1
+
+ assert rpos >= lpos # annotator hint, don't remove
+ return self._sliced(space, value, lpos, rpos, self)
+
+
def descr_getnewargs(self, space):
return space.newtuple([W_UnicodeObject(self._utf8, self._length)])
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -68,7 +68,7 @@
chr1 = ord(code[pos])
if chr1 < 0x80:
return pos + 1
- if 0xC2 >= chr1 <= 0xDF:
+ if 0xC2 <= chr1 <= 0xDF:
return pos + 2
if chr1 >= 0xE0 and chr1 <= 0xEF:
return pos + 3
@@ -165,7 +165,7 @@
pos += 1
return result.build(), pos, -1
-def check_newline_utf8(s, pos):
+def islinebreak(s, pos):
chr1 = ord(s[pos])
if 0xa <= chr1 <= 0xd:
return True
@@ -182,6 +182,41 @@
return chr3 == 0xa8 or chr3 == 0xa9
return False
+def isspace(s, pos):
+ chr1 = ord(s[pos])
+ if (chr1 == ord(' ') or chr1 == ord('\n') or chr1 == ord('\t') or
+ chr1 == ord('\r')):
+ return True # common
+ if chr1 == 0x0b or chr1 == 0x0c or (chr1 >= 0x1c and chr1 <= 0x1f):
+ return True # less common
+ if chr1 < 0x80:
+ return False
+ # obscure cases
+ chr2 = ord(s[pos + 1])
+ if chr1 == 0xc2:
+ return chr2 == 0x85 or chr2 == 0xa0
+ if chr1 == 0xe2:
+ if chr2 == 0x81 and s[pos + 2] == '\x9f':
+ return True
+ if chr2 != 0x80:
+ return False
+ chr3 = ord(s[pos + 2])
+ if chr3 >= 0x80 and chr3 <= 0x8a:
+ return True
+ if chr3 == 0xa9 or chr3 == 0xa8 or chr3 == 0xaf:
+ return True
+ return False
+ if chr1 == 0xe1:
+ chr3 = ord(s[pos + 2])
+ if chr2 == 0x9a and chr3 == 0x80:
+ return True
+ if chr2 == 0xa0 and chr3 == 0x8e:
+ return True
+ return False
+ if chr1 == 0xe3 and chr2 == 0x80 and s[pos + 2] == '\x80':
+ return True
+ return False
+
class Utf8CheckError(Exception):
def __init__(self, msg, startpos, endpos):
self.msg = msg
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -60,6 +60,13 @@
def test_check_newline_utf8():
for i in xrange(sys.maxunicode):
if runicode.unicodedb.islinebreak(i):
- assert rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)
+ assert rutf8.islinebreak(unichr(i).encode('utf8'), 0)
else:
- assert not rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)
+ assert not rutf8.islinebreak(unichr(i).encode('utf8'), 0)
+
+def test_isspace_utf8():
+ for i in xrange(sys.maxunicode):
+ if runicode.unicodedb.isspace(i):
+ assert rutf8.isspace(unichr(i).encode('utf8'), 0)
+ else:
+ assert not rutf8.isspace(unichr(i).encode('utf8'), 0)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit