Author: fijal
Branch: unicode-utf8
Changeset: r90539:4c1a6dc397c4
Date: 2017-03-04 19:39 +0100
http://bitbucket.org/pypy/pypy/changeset/4c1a6dc397c4/
Log: some progress, still errors in rsplit (we need to think!)
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -37,7 +37,16 @@
if isutf8:
return next_codepoint_pos(s, pos)
else:
- return pos + 1
+ return pos + 1
+
+@specialize.ll_and_arg(2)
+def _decr(s, pos, isutf8):  # index of the character preceding pos in s
+    from rpython.rlib.rutf8 import prev_codepoint_pos
+
+    if isutf8:
+        return prev_codepoint_pos(s, pos)  # step back one whole codepoint
+    else:
+        return pos - 1  # not utf8: every character is exactly one index
@specialize.ll_and_arg(3)
def split(value, by=None, maxsplit=-1, isutf8=0):
@@ -132,7 +141,7 @@
while i >= 0:
if not _isspace(value, i):
break # found
- i -= 1
+ i = _decr(value, i, isutf8)
else:
break # end of string, finished
@@ -141,18 +150,21 @@
if maxsplit == 0:
j = -1 # take all the rest of the string
else:
- j = i - 1
+ j = _decr(value, i, isutf8)
while j >= 0 and not _isspace(value, j):
- j -= 1
+ j = _decr(value, j, isutf8)
maxsplit -= 1 # NB. if it's already < 0, it stays < 0
# the word is value[j+1:i+1]
- j1 = j + 1
+ if j < 0:
+ j1 = 0
+ else:
+ j1 = _incr(value, j, isutf8)
assert j1 >= 0
res.append(value[j1:i+1])
# continue to look from the character before the space before the word
- i = j - 1
+ i = _decr(value, j, isutf8)
res.reverse()
return res
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -57,6 +57,10 @@
return lgt
raise ValueError
+# note - table lookups are really slow compared to the comparisons below.
+# Measured on parts of the Obama page of Chinese Wikipedia, they're anywhere
+# between 10% and 30% slower; in extreme cases (small, Chinese-only text), 40%.
+
def next_codepoint_pos(code, pos):
""" Gives the position of the next codepoint after pos, -1
if it's the last one (assumes valid utf8)
@@ -64,7 +68,21 @@
chr1 = ord(code[pos])
if chr1 < 0x80:
return pos + 1
- return pos + ord(runicode._utf8_code_length[chr1 - 0x80])
+    if 0xC2 <= chr1 <= 0xDF:  # lead byte of a 2-byte sequence
+        return pos + 2
+    if 0xE0 <= chr1 <= 0xEF:  # lead byte of a 3-byte sequence
+        return pos + 3
+    return pos + 4  # remaining lead bytes start 4-byte sequences (valid utf8)
+
+def prev_codepoint_pos(code, pos):
+    """ Gives the position of the codepoint that ends right before pos,
+    a negative number if there is none (assumes valid utf8)
+    """
+    pos -= 1
+    # continuation bytes look like 0b10xxxxxx; skip them to reach the lead
+    while pos > 0 and (ord(code[pos]) & 0xC0) == 0x80:
+        pos -= 1
+    return pos
def compute_length_utf8(s):
pos = 0
diff --git a/rpython/rlib/test/test_rstring.py b/rpython/rlib/test/test_rstring.py
--- a/rpython/rlib/test/test_rstring.py
+++ b/rpython/rlib/test/test_rstring.py
@@ -43,6 +43,12 @@
assert split(u'endcase test', u'test') == [u'endcase ', u'']
py.test.raises(ValueError, split, u'abc', u'')
+def test_split_utf8():  # split() on byte strings with isutf8=1
+ assert split('', 'a', isutf8=1) == ['']  # empty input gives one empty piece
+ assert split('baba', 'a', isutf8=1) == ['b', 'b', '']  # explicit separator
+ assert split('b b', isutf8=1) == ['b', 'b']  # no separator given: ASCII space
+ assert split('b\xe1\x9a\x80b', isutf8=1) == ['b', 'b']  # '\xe1\x9a\x80' is U+1680 in utf8, expected to separate like a space
+
def test_rsplit():
def check_rsplit(value, sub, *args, **kwargs):
result = kwargs['res']
@@ -77,6 +83,12 @@
assert rsplit(u'endcase test', u'test') == [u'endcase ', u'']
py.test.raises(ValueError, rsplit, u"abc", u'')
+def test_rsplit_utf8():  # rsplit() on byte strings with isutf8=1
+ assert rsplit('', 'a', isutf8=1) == ['']  # empty input gives one empty piece
+ assert rsplit('baba', 'a', isutf8=1) == ['b', 'b', '']  # explicit separator
+ assert rsplit('b b', isutf8=1) == ['b', 'b']  # no separator given: ASCII space
+ assert rsplit('b\xe1\x9a\x80b', isutf8=1) == ['b', 'b']  # '\xe1\x9a\x80' is U+1680 in utf8, expected to separate like a space
+
def test_string_replace():
def check_replace(value, sub, *args, **kwargs):
result = kwargs['res']
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -25,9 +25,6 @@
else:
assert not raised
-def error_handler(errors, encoding, msg, char, start, end):
- raise UnicodeDecodeError(encoding, char, start, end, msg)
-
@given(strategies.binary())
def test_str_check_utf8(s):
try:
@@ -36,11 +33,10 @@
except UnicodeDecodeError as e:
valid = False
try:
- consumed, length = rutf8.str_check_utf8(s, len(s), None,
- errorhandler=error_handler, final=True)
- except UnicodeDecodeError as a:
+ consumed, length = rutf8.str_check_utf8(s, len(s), final=True)
+ except rutf8.Utf8CheckError as a:
assert not valid
- assert a.start == e.start
+ assert a.startpos == e.start
# assert a.end == e.end, ideally
else:
assert valid
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit