[pypy-commit] pypy unicode-utf8: some progress, still errors in rsplit (we need to think!)

fijal Sat, 04 Mar 2017 10:40:59 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r90539:4c1a6dc397c4
Date: 2017-03-04 19:39 +0100
http://bitbucket.org/pypy/pypy/changeset/4c1a6dc397c4/


Log:    some progress, still errors in rsplit (we need to think!)

diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -37,7 +37,16 @@
     if isutf8:
         return next_codepoint_pos(s, pos)
     else:
-        return pos + 1        
+        return pos + 1
+
+@specialize.ll_and_arg(2)
+def _decr(s, pos, isutf8):
+    from rpython.rlib.rutf8 import prev_codepoint_pos
+
+    if isutf8:
+        return prev_codepoint_pos(s, pos)
+    else:
+        return pos - 1
 
 @specialize.ll_and_arg(3)
 def split(value, by=None, maxsplit=-1, isutf8=0):
@@ -132,7 +141,7 @@
             while i >= 0:
                 if not _isspace(value, i):
                     break   # found
-                i -= 1
+                i = _decr(value, i, isutf8)
             else:
                 break  # end of string, finished
 
@@ -141,18 +150,21 @@
             if maxsplit == 0:
                 j = -1   # take all the rest of the string
             else:
-                j = i - 1
+                j = _decr(value, i, isutf8)
                 while j >= 0 and not _isspace(value, j):
-                    j -= 1
+                    j = _decr(value, j, isutf8)
                 maxsplit -= 1   # NB. if it's already < 0, it stays < 0
 
             # the word is value[j+1:i+1]
-            j1 = j + 1
+            if j < 0:
+                j1 = 0
+            else:
+                j1 = _incr(value, j, isutf8)
             assert j1 >= 0
             res.append(value[j1:i+1])
 
             # continue to look from the character before the space before the word
-            i = j - 1
+            i = _decr(value, j, isutf8)
 
         res.reverse()
         return res
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -57,6 +57,10 @@
         return lgt
     raise ValueError
 
+# note - table lookups are really slow. Measured on various elements of obama
+#        chinese wikipedia, they're anywhere between 10% and 30% slower.
+#        In extreme cases (small, only chinese text), they're 40% slower
+
 def next_codepoint_pos(code, pos):
     """ Gives the position of the next codepoint after pos, -1
     if it's the last one (assumes valid utf8)
@@ -64,7 +68,21 @@
     chr1 = ord(code[pos])
     if chr1 < 0x80:
         return pos + 1
-    return pos + ord(runicode._utf8_code_length[chr1 - 0x80])
+    if 0xC2 >= chr1 <= 0xDF:
+        return pos + 2
+    if chr1 >= 0xE0 and chr1 <= 0xEF:
+        return pos + 3
+    return pos + 4
+
+def prev_codepoint_pos(code, pos):
+    """ Gives the position of the previous codepoint
+    """
+    chr1 = ord(code[pos])
+    if chr1 < 0x80:
+        return pos - 1
+    while ord(code[pos]) & 0xC0 == 0xC0:
+        pos -= 1
+    return pos
 
 def compute_length_utf8(s):
     pos = 0
diff --git a/rpython/rlib/test/test_rstring.py b/rpython/rlib/test/test_rstring.py
--- a/rpython/rlib/test/test_rstring.py
+++ b/rpython/rlib/test/test_rstring.py
@@ -43,6 +43,12 @@
     assert split(u'endcase test', u'test') == [u'endcase ', u'']
     py.test.raises(ValueError, split, u'abc', u'')
 
+def test_split_utf8():
+    assert split('', 'a', isutf8=1) == ['']
+    assert split('baba', 'a', isutf8=1) == ['b', 'b', '']
+    assert split('b b', isutf8=1) == ['b', 'b']
+    assert split('b\xe1\x9a\x80b', isutf8=1) == ['b', 'b']
+
 def test_rsplit():
     def check_rsplit(value, sub, *args, **kwargs):
         result = kwargs['res']
@@ -77,6 +83,12 @@
     assert rsplit(u'endcase test', u'test') == [u'endcase ', u'']
     py.test.raises(ValueError, rsplit, u"abc", u'')
 
+def test_rsplit_utf8():
+    assert rsplit('', 'a', isutf8=1) == ['']
+    assert rsplit('baba', 'a', isutf8=1) == ['b', 'b', '']
+    assert rsplit('b b', isutf8=1) == ['b', 'b']
+    assert rsplit('b\xe1\x9a\x80b', isutf8=1) == ['b', 'b']
+
 def test_string_replace():
     def check_replace(value, sub, *args, **kwargs):
         result = kwargs['res']
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -25,9 +25,6 @@
     else:
         assert not raised
 
-def error_handler(errors, encoding, msg, char, start, end):
-    raise UnicodeDecodeError(encoding, char, start, end, msg)
-
 @given(strategies.binary())
 def test_str_check_utf8(s):
     try:
@@ -36,11 +33,10 @@
     except UnicodeDecodeError as e:
         valid = False
     try:
-        consumed, length = rutf8.str_check_utf8(s, len(s), None,
-            errorhandler=error_handler, final=True)
-    except UnicodeDecodeError as a:
+        consumed, length = rutf8.str_check_utf8(s, len(s), final=True)
+    except rutf8.Utf8CheckError as a:
         assert not valid
-        assert a.start == e.start
+        assert a.startpos == e.start
         # assert a.end == e.end, ideally
     else:
         assert valid
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: some progress, still errors in rsplit (we need to think!)

Reply via email to