Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8
Changeset: r93398:a31a10da4e39
Date: 2017-12-12 19:36 +0000
http://bitbucket.org/pypy/pypy/changeset/a31a10da4e39/

Log:    UnicodeBuilder should return unicode; fix rutf8.check_utf8() return
        value

diff --git a/pypy/module/__pypy__/interp_builders.py b/pypy/module/__pypy__/interp_builders.py
--- a/pypy/module/__pypy__/interp_builders.py
+++ b/pypy/module/__pypy__/interp_builders.py
@@ -3,6 +3,7 @@
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import TypeDef
 from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.rutf8 import StringBuilder, Utf8StringBuilder
 from pypy.objspace.std.unicodeobject import W_UnicodeObject
 from rpython.tool.sourcetools import func_with_new_name
 
@@ -54,13 +55,13 @@
 class W_UnicodeBuilder(W_Root):
     def __init__(self, space, size):
         if size < 0:
-            self.builder = StringBuilder()
+            self.builder = Utf8StringBuilder()
         else:
-            self.builder = StringBuilder(size)
+            self.builder = Utf8StringBuilder(size)
 
     @unwrap_spec(size=int)
     def descr__new__(space, w_subtype, size=-1):
-        return W_UnicodeBuilder(space, size)
+        return W_UnicodeBuilder(space, 3 * size)
 
     @unwrap_spec(s='utf8')
     def descr_append(self, space, s):
@@ -76,7 +77,7 @@
         self.builder.append_slice(w_unicode._utf8, byte_start, byte_end)
 
     def descr_build(self, space):
-        w_s = space.newtext(self.builder.build())
+        w_s = space.newutf8(self.builder.build(), self.builder.get_length())
         # after build(), we can continue to append more strings
         # to the same builder.  This is supported since
         # 2ff5087aca28 in RPython.
diff --git a/pypy/module/__pypy__/test/test_builders.py b/pypy/module/__pypy__/test/test_builders.py
--- a/pypy/module/__pypy__/test/test_builders.py
+++ b/pypy/module/__pypy__/test/test_builders.py
@@ -9,9 +9,11 @@
         b.append(u"1")
         s = b.build()
         assert s == u"abc1231"
+        assert type(s) is unicode
         assert b.build() == s
         b.append(u"123")
         assert b.build() == s + u"123"
+        assert type(b.build()) is unicode
 
     def test_preallocate(self):
         from __pypy__.builders import UnicodeBuilder
@@ -20,6 +22,7 @@
         b.append(u"123")
         s = b.build()
         assert s == u"abc123"
+        assert type(s) is unicode
 
     def test_append_slice(self):
         from __pypy__.builders import UnicodeBuilder
@@ -28,8 +31,11 @@
         raises(ValueError, b.append_slice, u"1", 2, 1)
         s = b.build()
         assert s == u"cde"
+        assert type(s) is unicode
         b.append_slice(u"abc", 1, 2)
-        assert b.build() == u"cdeb"
+        s = b.build()
+        assert s == u"cdeb"
+        assert type(s) is unicode
 
     def test_stringbuilder(self):
         from __pypy__.builders import StringBuilder
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -407,10 +407,10 @@
             continue
 
         return ~(pos - 1)
-
     assert pos == end
-    assert pos - continuation_bytes >= 0
-    return pos - continuation_bytes
+    result = pos - continuation_bytes - start
+    assert result >= 0
+    return result
 
 def has_surrogates(utf8):
     # XXX write a faster version maybe
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -29,7 +29,6 @@
     else:
         assert not raised
 
-@settings(max_examples=10000)
 @given(strategies.binary(), strategies.booleans())
 @example('\xf1\x80\x80\x80', False)
 def test_check_utf8(s, allow_surrogates):
@@ -39,6 +38,13 @@
 def test_check_utf8_valid(u, allow_surrogates):
     _test_check_utf8(u.encode('utf-8'), allow_surrogates)
 
+@given(strategies.binary(), strategies.text(), strategies.binary())
+def test_check_utf8_slice(a, b, c):
+    start = len(a)
+    b_utf8 = b.encode('utf-8')
+    end = start + len(b_utf8)
+    assert rutf8.check_utf8(a + b_utf8 + c, False, start, end) == len(b)
+
 def _has_surrogates(s):
     for u in s.decode('utf8'):
         if 0xD800 <= ord(u) <= 0xDFFF:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to