[pypy-commit] pypy unicode-utf8: implement isspace as a regex

fijal Sat, 04 Mar 2017 14:03:06 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r90546:e0e41208baf4
Date: 2017-03-04 22:55 +0100
http://bitbucket.org/pypy/pypy/changeset/e0e41208baf4/


Log:    implement isspace as a regex

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -194,7 +194,7 @@
         return unicodedb.iscased(ord(ch))
 
     def _islinebreak(self, s, pos):
-        return rutf8.check_newline_utf8(s, pos)
+        return rutf8.islinebreak(s, pos)
 
     def _upper(self, ch):
         return unichr(unicodedb.toupper(ord(ch)))
@@ -668,6 +668,25 @@
 
         return W_UnicodeObject(value, self._len())
 
+    def _strip_none(self, space, left, right):
+        "internal function called by str_xstrip methods"
+        value = self._utf8
+
+        lpos = 0
+        rpos = self._len()
+
+        if left:
+            while lpos < rpos and self._isspace(value[lpos]):
+                lpos += 1
+
+        if right:
+            while rpos > lpos and self._isspace(value[rpos - 1]):
+                rpos -= 1
+
+        assert rpos >= lpos    # annotator hint, don't remove
+        return self._sliced(space, value, lpos, rpos, self)
+
+
     def descr_getnewargs(self, space):
         return space.newtuple([W_UnicodeObject(self._utf8, self._length)])
 
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -68,7 +68,7 @@
     chr1 = ord(code[pos])
     if chr1 < 0x80:
         return pos + 1
-    if 0xC2 >= chr1 <= 0xDF:
+    if 0xC2 <= chr1 <= 0xDF:
         return pos + 2
     if chr1 >= 0xE0 and chr1 <= 0xEF:
         return pos + 3
@@ -165,7 +165,7 @@
         pos += 1
     return result.build(), pos, -1
 
-def check_newline_utf8(s, pos):
+def islinebreak(s, pos):
     chr1 = ord(s[pos])
     if 0xa <= chr1 <= 0xd:
         return True
@@ -182,6 +182,41 @@
         return chr3 == 0xa8 or chr3 == 0xa9
     return False
 
+def isspace(s, pos):
+    chr1 = ord(s[pos])
+    if (chr1 == ord(' ') or chr1 == ord('\n') or chr1 == ord('\t') or
+        chr1 == ord('\r')):
+        return True # common
+    if chr1 == 0x0b or chr1 == 0x0c or (chr1 >= 0x1c and chr1 <= 0x1f):
+        return True # less common
+    if chr1 < 0x80:
+        return False
+    # obscure cases
+    chr2 = ord(s[pos + 1])
+    if chr1 == 0xc2:
+        return chr2 == 0x85 or chr2 == 0xa0
+    if chr1 == 0xe2:
+        if chr2 == 0x81 and s[pos + 2] == '\x9f':
+            return True
+        if chr2 != 0x80:
+            return False
+        chr3 = ord(s[pos + 2])
+        if chr3 >= 0x80 and chr3 <= 0x8a:
+            return True
+        if chr3 == 0xa9 or chr3 == 0xa8 or chr3 == 0xaf:
+            return True
+        return False
+    if chr1 == 0xe1:
+        chr3 = ord(s[pos + 2])
+        if chr2 == 0x9a and chr3 == 0x80:
+            return True
+        if chr2 == 0xa0 and chr3 == 0x8e:
+            return True
+        return False
+    if chr1 == 0xe3 and chr2 == 0x80 and s[pos + 2] == '\x80':
+        return True
+    return False
+
 class Utf8CheckError(Exception):
     def __init__(self, msg, startpos, endpos):
         self.msg = msg
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -60,6 +60,13 @@
 def test_check_newline_utf8():
     for i in xrange(sys.maxunicode):
         if runicode.unicodedb.islinebreak(i):
-            assert rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)
+            assert rutf8.islinebreak(unichr(i).encode('utf8'), 0)
         else:
-            assert not rutf8.check_newline_utf8(unichr(i).encode('utf8'), 0)
+            assert not rutf8.islinebreak(unichr(i).encode('utf8'), 0)
+
+def test_isspace_utf8():
+    for i in xrange(sys.maxunicode):
+        if runicode.unicodedb.isspace(i):
+            assert rutf8.isspace(unichr(i).encode('utf8'), 0)
+        else:
+            assert not rutf8.isspace(unichr(i).encode('utf8'), 0)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: implement isspace as a regex

Reply via email to