[pypy-commit] pypy unicode-utf8: Inline the ascii part of unichr_as_utf8_append()

arigo Wed, 06 Dec 2017 02:44:39 -0800

Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8
Changeset: r93284:6d7f2e710bd2
Date: 2017-12-06 11:42 +0100
http://bitbucket.org/pypy/pypy/changeset/6d7f2e710bd2/


Log:    Inline the ascii part of unichr_as_utf8_append()

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -17,7 +17,7 @@
 
 import sys
 from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
-from rpython.rlib.objectmodel import always_inline
+from rpython.rlib.objectmodel import always_inline, dont_inline
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import jit
 from rpython.rlib.signature import signature
@@ -50,6 +50,7 @@
                 chr((0x80 | (code & 0x3f))))
     raise ValueError
 
+@always_inline
 def unichr_as_utf8_append(builder, code, allow_surrogates=False):
     """Encode code (numeric value) as utf8 encoded string
     and emit the result into the given StringBuilder.
@@ -59,13 +60,40 @@
     if code <= r_uint(0x7F):
         # Encode ASCII
         builder.append(chr(code))
-        return
+    else:
+        # Encode non-ASCII, uses a function call
+        if allow_surrogates:
+            _nonascii_unichr_as_utf8_append(builder, code)
+        else:
+            _nonascii_unichr_as_utf8_append_nosurrogates(builder, code)
+
+@dont_inline
+def _nonascii_unichr_as_utf8_append(builder, code):
     if code <= r_uint(0x07FF):
         builder.append(chr((0xc0 | (code >> 6))))
         builder.append(chr((0x80 | (code & 0x3f))))
         return
     if code <= r_uint(0xFFFF):
-        if not allow_surrogates and 0xd800 <= code <= 0xdfff:
+        builder.append(chr((0xe0 | (code >> 12))))
+        builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+        builder.append(chr((0x80 | (code & 0x3f))))
+        return
+    if code <= r_uint(0x10FFFF):
+        builder.append(chr((0xf0 | (code >> 18))))
+        builder.append(chr((0x80 | ((code >> 12) & 0x3f))))
+        builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
+        builder.append(chr((0x80 | (code & 0x3f))))
+        return
+    raise ValueError
+
+@dont_inline
+def _nonascii_unichr_as_utf8_append_nosurrogates(builder, code):
+    if code <= r_uint(0x07FF):
+        builder.append(chr((0xc0 | (code >> 6))))
+        builder.append(chr((0x80 | (code & 0x3f))))
+        return
+    if code <= r_uint(0xFFFF):
+        if 0xd800 <= code <= 0xdfff:
             raise ValueError
         builder.append(chr((0xe0 | (code >> 12))))
         builder.append(chr((0x80 | ((code >> 6) & 0x3f))))
@@ -79,6 +107,7 @@
         return
     raise ValueError
 
+
 # note - table lookups are really slow. Measured on various elements of obama
 #        chinese wikipedia, they're anywhere between 10% and 30% slower.
 #        In extreme cases (small, only chinese text), they're 40% slower
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: Inline the ascii part of unichr_as_utf8_append()

Reply via email to