Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8
Changeset: r93282:cc3f32cc59be
Date: 2017-12-06 11:18 +0100
http://bitbucket.org/pypy/pypy/changeset/cc3f32cc59be/

Log: This reduces the overhead from 50% to 35%

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -734,11 +734,43 @@
 
     @always_inline
     def next(self):
-        if self._pos == self._end:
+        pos = self._pos
+        if pos == self._end:
             raise StopIteration
-        ret = codepoint_at_pos(self._utf8, self._pos)
-        self._pos = next_codepoint_pos(self._utf8, self._pos)
-        return ret
+        #----- sane-looking version: ------
+        #ret = codepoint_at_pos(self._utf8, self._pos)
+        #self._pos = next_codepoint_pos(self._utf8, self._pos)
+        #return ret
+        #----- manually inlined version follows, with merged checks -----
+
+        code = self._utf8
+        ordch1 = ord(code[pos])
+        if ordch1 <= 0x7F:
+            self._pos = pos + 1
+            return ordch1
+
+        ordch2 = ord(code[pos+1])
+        if ordch1 <= 0xDF:
+            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+            self._pos = pos + 2
+            return (ordch1 << 6) + ordch2 - (
+                   (0xC0 << 6) + 0x80 )
+
+        ordch3 = ord(code[pos+2])
+        if ordch1 <= 0xEF:
+            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+            self._pos = pos + 3
+            return (ordch1 << 12) + (ordch2 << 6) + ordch3 - (
+                   (0xE0 << 12) + (0x80 << 6) + 0x80 )
+
+        ordch4 = ord(code[pos+3])
+        if True:
+            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+            self._pos = pos + 4
+            return (ordch1 << 18) + (ordch2 << 12) + (ordch3 << 6) + ordch4 - (
+                   (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 )
+        assert False, "unreachable"
+
 
 def decode_latin_1(s):
     if len(s) == 0:
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit