Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8
Changeset: r93398:a31a10da4e39
Date: 2017-12-12 19:36 +0000
http://bitbucket.org/pypy/pypy/changeset/a31a10da4e39/
Log: UnicodeBuilder should return unicode; fix rutf8.check_utf8() return
value
diff --git a/pypy/module/__pypy__/interp_builders.py
b/pypy/module/__pypy__/interp_builders.py
--- a/pypy/module/__pypy__/interp_builders.py
+++ b/pypy/module/__pypy__/interp_builders.py
@@ -3,6 +3,7 @@
from pypy.interpreter.gateway import interp2app, unwrap_spec
from pypy.interpreter.typedef import TypeDef
from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.rutf8 import StringBuilder, Utf8StringBuilder
from pypy.objspace.std.unicodeobject import W_UnicodeObject
from rpython.tool.sourcetools import func_with_new_name
@@ -54,13 +55,13 @@
class W_UnicodeBuilder(W_Root):
def __init__(self, space, size):
if size < 0:
- self.builder = StringBuilder()
+ self.builder = Utf8StringBuilder()
else:
- self.builder = StringBuilder(size)
+ self.builder = Utf8StringBuilder(size)
@unwrap_spec(size=int)
def descr__new__(space, w_subtype, size=-1):
- return W_UnicodeBuilder(space, size)
+ return W_UnicodeBuilder(space, 3 * size)
@unwrap_spec(s='utf8')
def descr_append(self, space, s):
@@ -76,7 +77,7 @@
self.builder.append_slice(w_unicode._utf8, byte_start, byte_end)
def descr_build(self, space):
- w_s = space.newtext(self.builder.build())
+ w_s = space.newutf8(self.builder.build(), self.builder.get_length())
# after build(), we can continue to append more strings
# to the same builder. This is supported since
# 2ff5087aca28 in RPython.
diff --git a/pypy/module/__pypy__/test/test_builders.py
b/pypy/module/__pypy__/test/test_builders.py
--- a/pypy/module/__pypy__/test/test_builders.py
+++ b/pypy/module/__pypy__/test/test_builders.py
@@ -9,9 +9,11 @@
b.append(u"1")
s = b.build()
assert s == u"abc1231"
+ assert type(s) is unicode
assert b.build() == s
b.append(u"123")
assert b.build() == s + u"123"
+ assert type(b.build()) is unicode
def test_preallocate(self):
from __pypy__.builders import UnicodeBuilder
@@ -20,6 +22,7 @@
b.append(u"123")
s = b.build()
assert s == u"abc123"
+ assert type(s) is unicode
def test_append_slice(self):
from __pypy__.builders import UnicodeBuilder
@@ -28,8 +31,11 @@
raises(ValueError, b.append_slice, u"1", 2, 1)
s = b.build()
assert s == u"cde"
+ assert type(s) is unicode
b.append_slice(u"abc", 1, 2)
- assert b.build() == u"cdeb"
+ s = b.build()
+ assert s == u"cdeb"
+ assert type(s) is unicode
def test_stringbuilder(self):
from __pypy__.builders import StringBuilder
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -407,10 +407,10 @@
continue
return ~(pos - 1)
-
assert pos == end
- assert pos - continuation_bytes >= 0
- return pos - continuation_bytes
+ result = pos - continuation_bytes - start
+ assert result >= 0
+ return result
def has_surrogates(utf8):
# XXX write a faster version maybe
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -29,7 +29,6 @@
else:
assert not raised
-@settings(max_examples=10000)
@given(strategies.binary(), strategies.booleans())
@example('\xf1\x80\x80\x80', False)
def test_check_utf8(s, allow_surrogates):
@@ -39,6 +38,13 @@
def test_check_utf8_valid(u, allow_surrogates):
_test_check_utf8(u.encode('utf-8'), allow_surrogates)
+@given(strategies.binary(), strategies.text(), strategies.binary())
+def test_check_utf8_slice(a, b, c):
+ start = len(a)
+ b_utf8 = b.encode('utf-8')
+ end = start + len(b_utf8)
+ assert rutf8.check_utf8(a + b_utf8 + c, False, start, end) == len(b)
+
def _has_surrogates(s):
for u in s.decode('utf8'):
if 0xD800 <= ord(u) <= 0xDFFF:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit