Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r93284:6d7f2e710bd2
Date: 2017-12-06 11:42 +0100
http://bitbucket.org/pypy/pypy/changeset/6d7f2e710bd2/
Log: Inline the ASCII part of unichr_as_utf8_append()
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -17,7 +17,7 @@
import sys
from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
-from rpython.rlib.objectmodel import always_inline
+from rpython.rlib.objectmodel import always_inline, dont_inline
from rpython.rlib.rstring import StringBuilder
from rpython.rlib import jit
from rpython.rlib.signature import signature
@@ -50,6 +50,7 @@
chr((0x80 | (code & 0x3f))))
raise ValueError
+@always_inline
def unichr_as_utf8_append(builder, code, allow_surrogates=False):
"""Encode code (numeric value) as utf8 encoded string
and emit the result into the given StringBuilder.
@@ -59,13 +60,40 @@
if code <= r_uint(0x7F):
# Encode ASCII
builder.append(chr(code))
- return
+ else:
+ # Encode non-ASCII, uses a function call
+ if allow_surrogates:
+ _nonascii_unichr_as_utf8_append(builder, code)
+ else:
+ _nonascii_unichr_as_utf8_append_nosurrogates(builder, code)
+
+@dont_inline
+def _nonascii_unichr_as_utf8_append(builder, code):
if code <= r_uint(0x07FF):
builder.append(chr((0xc0 | (code >> 6))))
builder.append(chr((0x80 | (code & 0x3f))))
return
if code <= r_uint(0xFFFF):
- if not allow_surrogates and 0xd800 <= code <= 0xdfff:
+ builder.append(chr((0xe0 | (code >> 12))))
+ builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+ builder.append(chr((0x80 | (code & 0x3f))))
+ return
+ if code <= r_uint(0x10FFFF):
+ builder.append(chr((0xf0 | (code >> 18))))
+ builder.append(chr((0x80 | ((code >> 12) & 0x3f))))
+ builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+ builder.append(chr((0x80 | (code & 0x3f))))
+ return
+ raise ValueError
+
+@dont_inline
+def _nonascii_unichr_as_utf8_append_nosurrogates(builder, code):
+ if code <= r_uint(0x07FF):
+ builder.append(chr((0xc0 | (code >> 6))))
+ builder.append(chr((0x80 | (code & 0x3f))))
+ return
+ if code <= r_uint(0xFFFF):
+ if 0xd800 <= code <= 0xdfff:
raise ValueError
builder.append(chr((0xe0 | (code >> 12))))
builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
@@ -79,6 +107,7 @@
return
raise ValueError
+
# note - table lookups are really slow. Measured on various elements of obama
# chinese wikipedia, they're anywhere between 10% and 30% slower.
# In extreme cases (small, only chinese text), they're 40% slower
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit