Author: fijal
Branch: unicode-utf8
Changeset: r93138:9ede67aee27e
Date: 2017-11-23 15:49 +0100
http://bitbucket.org/pypy/pypy/changeset/9ede67aee27e/

Log:    Utf8StringBuilder

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -16,9 +16,11 @@
 """
 
 import sys
-from rpython.rlib.objectmodel import enforceargs, we_are_translated
+from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import jit
+from rpython.rlib.signature import signature
+from rpython.rlib.types import char, none
 from rpython.rlib.rarithmetic import r_uint
 from rpython.rlib.unicodedata import unicodedb
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -316,6 +318,11 @@
         return res, flag
     raise CheckError(~res)
 
+def get_utf8_length_flag(s):
+    """ Get the length and flag out of valid utf8. For now just calls check_utf8
+    """
+    return check_utf8(s, True)
+
 @jit.elidable
 def _check_utf8(s, allow_surrogates, start, stop):
     pos = start
@@ -655,6 +662,53 @@
 
     return unicode_escape #, char_escape_helper
 
+class Utf8StringBuilder(object):  # StringBuilder wrapper tracking length and flag
+    def __init__(self, size=0):
+        self._s = StringBuilder(size)  # build() returns this builder's result
+        self._lgt = 0  # running length, returned by get_length()
+        self._flag = FLAG_ASCII  # only ever changed through combine_flags()
+
+    def append(self, s):
+        # for whole strings; length and flag of s come from get_utf8_length_flag()
+        self._s.append(s)
+        newlgt, newflag = get_utf8_length_flag(s)
+        self._lgt += newlgt
+        self._flag = combine_flags(self._flag, newflag)
+
+    @signature(char(), returns=none())
+    def append_char(self, s):
+        # for single characters, ascii only: the flag is not updated here
+        self._lgt += 1
+        self._s.append(s)
+
+    def append_code(self, code):
+        self._flag = combine_flags(self._flag, get_flag_from_code(code))
+        self._lgt += 1
+        unichr_as_utf8_append(self._s, code, True)  # True: presumably allow_surrogates, confirm
+
+    def build(self):
+        return self._s.build()
+
+    def get_flag(self):
+        return self._flag
+
+    def get_length(self):
+        return self._lgt
+
+class Utf8StringIterator(object):  # steps through utf8s via codepoint_at_pos/next_codepoint_pos
+    def __init__(self, utf8s):
+        self._utf8 = utf8s
+        self._end = len(utf8s)  # position at which done() becomes true
+        self._pos = 0  # current position, advanced by next()
+
+    def done(self):
+        return self._pos == self._end
+
+    def next(self):  # no end check here; callers presumably test done() first
+        ret = codepoint_at_pos(self._utf8, self._pos)
+        self._pos = next_codepoint_pos(self._utf8, self._pos)
+        return ret
+
 def decode_latin_1(s):
     if len(s) == 0:
         return s
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -139,3 +139,39 @@
     result = rutf8.surrogate_in_utf8(uni)
     expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
     assert result == expected
+
+@given(strategies.text())
+def test_get_utf8_length_flag(u):  # checks against len(u) and a per-char ascii scan
+    exp_lgt = len(u)
+    exp_flag = rutf8.FLAG_ASCII
+    for c in u:
+        if ord(c) > 0x7F:
+            exp_flag = rutf8.FLAG_REGULAR
+    lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
+    assert lgt == exp_lgt
+    assert flag == exp_flag
+
+def test_utf8_string_builder():  # covers append, append_char and append_code
+    s = rutf8.Utf8StringBuilder()
+    s.append("foo")
+    s.append_char("x")
+    assert s.get_flag() == rutf8.FLAG_ASCII
+    assert s.get_length() == 4
+    assert s.build() == "foox"
+    s.append(u"\u1234".encode("utf8"))
+    assert s.get_flag() == rutf8.FLAG_REGULAR
+    assert s.get_length() == 5
+    assert s.build().decode("utf8") == u"foox\u1234"
+    s.append("foo")
+    s.append_char("x")
+    assert s.get_flag() == rutf8.FLAG_REGULAR
+    assert s.get_length() == 9
+    assert s.build().decode("utf8") == u"foox\u1234foox"
+    s = rutf8.Utf8StringBuilder()
+    s.append_code(0x1234)
+    assert s.build().decode("utf8") == u"\u1234"
+    assert s.get_flag() == rutf8.FLAG_REGULAR
+    assert s.get_length() == 1
+    s.append_code(0xD800)
+    assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
+    assert s.get_length() == 2
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to