Author: fijal
Branch: unicode-utf8
Changeset: r93138:9ede67aee27e
Date: 2017-11-23 15:49 +0100
http://bitbucket.org/pypy/pypy/changeset/9ede67aee27e/
Log: Utf8StringBuilder

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -16,9 +16,11 @@
 """
 
 import sys
-from rpython.rlib.objectmodel import enforceargs, we_are_translated
+from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import jit
+from rpython.rlib.signature import signature
+from rpython.rlib.types import char, none
 from rpython.rlib.rarithmetic import r_uint
 from rpython.rlib.unicodedata import unicodedb
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -316,6 +318,11 @@
         return res, flag
     raise CheckError(~res)
 
+def get_utf8_length_flag(s):
+    """ Get the length and flag out of valid utf8. For now just calls check_utf8
+    """
+    return check_utf8(s, True)
+
 @jit.elidable
 def _check_utf8(s, allow_surrogates, start, stop):
     pos = start
@@ -655,6 +662,53 @@
 
     return unicode_escape #, char_escape_helper
 
+class Utf8StringBuilder(object):
+    def __init__(self, size=0):
+        self._s = StringBuilder(size)
+        self._lgt = 0
+        self._flag = FLAG_ASCII
+
+    def append(self, s):
+        # for strings
+        self._s.append(s)
+        newlgt, newflag = get_utf8_length_flag(s)
+        self._lgt += newlgt
+        self._flag = combine_flags(self._flag, newflag)
+
+    @signature(char(), returns=none())
+    def append_char(self, s):
+        # for characters, ascii
+        self._lgt += 1
+        self._s.append(s)
+
+    def append_code(self, code):
+        self._flag = combine_flags(self._flag, get_flag_from_code(code))
+        self._lgt += 1
+        unichr_as_utf8_append(self._s, code, True)
+
+    def build(self):
+        return self._s.build()
+
+    def get_flag(self):
+        return self._flag
+
+    def get_length(self):
+        return self._lgt
+
+class Utf8StringIterator(object):
+    def __init__(self, utf8s):
+        self._utf8 = utf8s
+        self._end = len(utf8s)
+        self._pos = 0
+
+    def done(self):
+        return self._pos == self._end
+
+    def next(self):
+        ret = codepoint_at_pos(self._utf8, self._pos)
+        self._pos = next_codepoint_pos(self._utf8, self._pos)
+        return ret
+
 def decode_latin_1(s):
     if len(s) == 0:
         return s
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -139,3 +139,39 @@
     result = rutf8.surrogate_in_utf8(uni)
     expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
     assert result == expected
+
+@given(strategies.text())
+def test_get_utf8_length_flag(u):
+    exp_lgt = len(u)
+    exp_flag = rutf8.FLAG_ASCII
+    for c in u:
+        if ord(c) > 0x7F:
+            exp_flag = rutf8.FLAG_REGULAR
+    lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
+    assert lgt == exp_lgt
+    assert flag == exp_flag
+
+def test_utf8_string_builder():
+    s = rutf8.Utf8StringBuilder()
+    s.append("foo")
+    s.append_char("x")
+    assert s.get_flag() == rutf8.FLAG_ASCII
+    assert s.get_length() == 4
+    assert s.build() == "foox"
+    s.append(u"\u1234".encode("utf8"))
+    assert s.get_flag() == rutf8.FLAG_REGULAR
+    assert s.get_length() == 5
+    assert s.build().decode("utf8") == u"foox\u1234"
+    s.append("foo")
+    s.append_char("x")
+    assert s.get_flag() == rutf8.FLAG_REGULAR
+    assert s.get_length() == 9
+    assert s.build().decode("utf8") == u"foox\u1234foox"
+    s = rutf8.Utf8StringBuilder()
+    s.append_code(0x1234)
+    assert s.build().decode("utf8") == u"\u1234"
+    assert s.get_flag() == rutf8.FLAG_REGULAR
+    assert s.get_length() == 1
+    s.append_code(0xD800)
+    assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
+    assert s.get_length() == 2
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit