[issue14624] Faster utf-16 decoder

Serhiy Storchaka Thu, 03 May 2012 06:24:08 -0700

Serhiy Storchaka <storch...@gmail.com> added the comment:

Here is an updated patch, taking into account that unicode_widen is already
optimized.


----------
Added file: http://bugs.python.org/file25443/decode_utf16_2.patch

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue14624>
_______________________________________

diff -r 0a9143d7b097 Objects/stringlib/asciilib.h
--- a/Objects/stringlib/asciilib.h      Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/asciilib.h      Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             asciilib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    1
+#define STRINGLIB_MAX_CHAR       0x7Fu
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h        Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/codecs.h        Thu May 03 15:50:11 2012 +0300
@@ -150,7 +150,6 @@
     return ret;
 }
 
-#undef LONG_PTR_MASK
 #undef ASCII_CHAR_MASK
 
 
@@ -350,4 +349,153 @@
 #undef MAX_SHORT_UNICHARS
 }
 
+#define UCS2_REPEAT_MASK        (~0ul / 0xFFFFul)
+
+/* The mask for fast checking of whether a C 'long' may contain
+   UTF16-encoded surrogate characters. This is an efficient heuristic,
+   assuming that non-surrogate characters with a code point >= 0x8000 are
+   rare in most input.
+*/
+#if STRINGLIB_SIZEOF_CHAR == 1
+# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
+#else
+# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
+#endif
+/* The mask for fast byteswapping. */
+#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
+/* Swap bytes. */
+#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
+                                 (((value) & STRIPPED_MASK) << 8))
+
+Py_LOCAL_INLINE(Py_UCS4)
+STRINGLIB(utf16_try_decode)(STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
+                            const unsigned char **inptr,
+                            const unsigned char *e,
+                            int native_ordering)
+{
+    const unsigned char *aligned_end =
+            (const unsigned char *) ((size_t) (e + 1) & ~LONG_PTR_MASK);
+    const unsigned char *q = *inptr;
+    STRINGLIB_CHAR *p = dest + *outpos;
+    /* Offsets from q for retrieving byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = !!native_ordering, ilo = !native_ordering;
+#else
+    int ihi = !native_ordering, ilo = !!native_ordering;
+#endif
+
+    while (q < e) {
+        Py_UCS4 ch;
+        /* First check for possible aligned read of a C 'long'. Unaligned
+           reads are more expensive, better to defer to another iteration. */
+        if (!((size_t) q & LONG_PTR_MASK)) {
+            /* Fast path for runs of non-surrogate chars. */
+            register const unsigned char *_q = q;
+            while (_q < aligned_end) {
+                unsigned long block = * (unsigned long *) _q;
+                /* Fast checking of whether a C 'long' may contain
+                   UTF16-encoded surrogate characters. This is an efficient
+                   heuristic, assuming that non-surrogate characters with
+                   a code point >= 0x8000 are rare in most input.
+                */
+                if (native_ordering) {
+                    /* Can use buffer directly */
+                    if (block & FAST_CHAR_MASK)
+                        break;
+                }
+                else {
+                    /* Need to byte-swap */
+                    if (block & SWAB(FAST_CHAR_MASK))
+                        break;
+#if STRINGLIB_SIZEOF_CHAR == 1
+                    block >>= 8;
+#else
+                    block = SWAB(block);
+#endif
+                }
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if SIZEOF_LONG == 4
+                *(p + 0) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+                *(p + 1) = (STRINGLIB_CHAR)(block >> 16);
+#endif
+#if SIZEOF_LONG == 8
+                *(p + 0) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+                *(p + 1) = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
+                *(p + 2) = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
+                *(p + 3) = (STRINGLIB_CHAR)(block >> 48);
+#endif
+#else
+#if SIZEOF_LONG == 4
+                *(p + 0) = (STRINGLIB_CHAR)(block >> 16);
+                *(p + 1) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+#endif
+#if SIZEOF_LONG == 8
+                *(p + 0) = (STRINGLIB_CHAR)(block >> 48);
+                *(p + 1) = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
+                *(p + 2) = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
+                *(p + 3) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+#endif
+#endif
+                _q += SIZEOF_LONG;
+                p += SIZEOF_LONG / 2;
+            }
+            q = _q;
+            if (q >= e)
+                break;
+        }
+        ch = (q[ihi] << 8) | q[ilo];
+        q += 2;
+#if STRINGLIB_SIZEOF_CHAR == 1
+        if (ch <= STRINGLIB_MAX_CHAR) {
+            *p++ = (STRINGLIB_CHAR)ch;
+            continue;
+        }
+#endif
+        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+#if STRINGLIB_SIZEOF_CHAR >= 2
+            *p++ = (STRINGLIB_CHAR)ch;
+            continue;
+#else
+            *inptr = q;
+            *outpos = p - dest;
+            return ch;
+#endif
+        }
+        /* UTF-16 code pair: */
+        if (q < e) {
+            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
+                Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
+                q += 2;
+                if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
+                    ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
+#if STRINGLIB_SIZEOF_CHAR == 4
+                    *p++ = (STRINGLIB_CHAR)ch;
+                    continue;
+#else
+                    *inptr = q;
+                    *outpos = p - dest;
+                    return ch;
+#endif
+                }
+                *inptr = q;
+                *outpos = p - dest;
+                return 3; /* illegal UTF-16 surrogate */
+            }
+            *inptr = q;
+            *outpos = p - dest;
+            return 2; /* illegal encoding */
+        }
+        *inptr = q;
+        *outpos = p - dest;
+        return 1; /* unexpected end of data */
+    }
+    *inptr = q;
+    *outpos = p - dest;
+    return 0;
+}
+#undef UCS2_REPEAT_MASK
+#undef FAST_CHAR_MASK
+#undef STRIPPED_MASK
+#undef SWAB
+#undef LONG_PTR_MASK
 #endif /* STRINGLIB_IS_UNICODE */
diff -r 0a9143d7b097 Objects/stringlib/ucs1lib.h
--- a/Objects/stringlib/ucs1lib.h       Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs1lib.h       Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs1lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    1
+#define STRINGLIB_MAX_CHAR       0xFFu
 #define STRINGLIB_CHAR           Py_UCS1
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/ucs2lib.h
--- a/Objects/stringlib/ucs2lib.h       Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs2lib.h       Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs2lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    2
+#define STRINGLIB_MAX_CHAR       0xFFFFu
 #define STRINGLIB_CHAR           Py_UCS2
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/ucs4lib.h
--- a/Objects/stringlib/ucs4lib.h       Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs4lib.h       Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
 #define STRINGLIB(F)             ucs4lib_##F
 #define STRINGLIB_OBJECT         PyUnicodeObject
 #define STRINGLIB_SIZEOF_CHAR    4
+#define STRINGLIB_MAX_CHAR       0x10FFFFu
 #define STRINGLIB_CHAR           Py_UCS4
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
diff -r 0a9143d7b097 Objects/stringlib/undef.h
--- a/Objects/stringlib/undef.h Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/undef.h Thu May 03 15:50:11 2012 +0300
@@ -1,6 +1,7 @@
 #undef  FASTSEARCH
 #undef  STRINGLIB
 #undef  STRINGLIB_SIZEOF_CHAR
+#undef  STRINGLIB_MAX_CHAR
 #undef  STRINGLIB_CHAR
 #undef  STRINGLIB_STR
 #undef  STRINGLIB_LEN
diff -r 0a9143d7b097 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c   Thu May 03 13:43:07 2012 +0200
+++ b/Objects/unicodeobject.c   Thu May 03 15:50:11 2012 +0300
@@ -4644,6 +4644,10 @@
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
+#include "stringlib/asciilib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
 #include "stringlib/ucs1lib.h"
 #include "stringlib/codecs.h"
 #include "stringlib/undef.h"
@@ -5472,25 +5476,6 @@
     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
 }
 
-/* Two masks for fast checking of whether a C 'long' may contain
-   UTF16-encoded surrogate characters. This is an efficient heuristic,
-   assuming that non-surrogate characters with a code point >= 0x8000 are
-   rare in most input.
-   FAST_CHAR_MASK is used when the input is in native byte ordering,
-   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
-*/
-#if (SIZEOF_LONG == 8)
-# define FAST_CHAR_MASK         0x8000800080008000L
-# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
-# define STRIPPED_MASK          0x00FF00FF00FF00FFL
-#elif (SIZEOF_LONG == 4)
-# define FAST_CHAR_MASK         0x80008000L
-# define SWAPPED_FAST_CHAR_MASK 0x00800080L
-# define STRIPPED_MASK          0x00FF00FFL
-#else
-# error C 'long' size should be either 4 or 8!
-#endif
-
 PyObject *
 PyUnicode_DecodeUTF16Stateful(const char *s,
                               Py_ssize_t size,
@@ -5503,30 +5488,22 @@
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     PyObject *unicode;
-    const unsigned char *q, *e, *aligned_end;
+    const unsigned char *q, *e;
     int bo = 0;       /* assume native ordering by default */
-    int native_ordering = 0;
+    int native_ordering;
     const char *errmsg = "";
-    /* Offsets from q for retrieving byte pairs in the right order. */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-    int ihi = 1, ilo = 0;
-#else
-    int ihi = 0, ilo = 1;
-#endif
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 
-    /* Note: size will always be longer than the resulting Unicode
-       character count */
-    unicode = PyUnicode_New(size, 127);
-    if (!unicode)
-        return NULL;
-    if (size == 0)
-        return unicode;
-    outpos = 0;
+    if (size == 0) {
+        if (consumed)
+            *consumed = 0;
+        Py_INCREF(unicode_empty);
+        return unicode_empty;
+    }
 
     q = (unsigned char *)s;
-    e = q + size - 1;
+    e = q + size;
 
     if (byteorder)
         bo = *byteorder;
@@ -5537,8 +5514,7 @@
        stream as-is (giving a ZWNBSP character). */
     if (bo == 0) {
         if (size >= 2) {
-            const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+            const Py_UCS4 bom = (q[1] << 8) | q[0];
             if (bom == 0xFEFF) {
                 q += 2;
                 bo = -1;
@@ -5547,143 +5523,80 @@
                 q += 2;
                 bo = 1;
             }
+        }
+    }
+
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    native_ordering = bo <= 0;
 #else
-            if (bom == 0xFEFF) {
-                q += 2;
-                bo = 1;
-            }
-            else if (bom == 0xFFFE) {
-                q += 2;
-                bo = -1;
-            }
-#endif
-        }
-    }
-
-    if (bo == -1) {
-        /* force LE */
-        ihi = 1;
-        ilo = 0;
-    }
-    else if (bo == 1) {
-        /* force BE */
-        ihi = 0;
-        ilo = 1;
-    }
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-    native_ordering = ilo < ihi;
-#else
-    native_ordering = ilo > ihi;
-#endif
-
-    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
-    while (q < e) {
-        Py_UCS4 ch;
-        /* First check for possible aligned read of a C 'long'. Unaligned
-           reads are more expensive, better to defer to another iteration. */
-        if (!((size_t) q & LONG_PTR_MASK)) {
-            /* Fast path for runs of non-surrogate chars. */
-            register const unsigned char *_q = q;
+    native_ordering = bo >= 0;
+#endif
+
+    /* Note: size will always be longer than the resulting Unicode
+       character count */
+    unicode = PyUnicode_New(size, 127);
+    if (!unicode)
+        return NULL;
+    outpos = 0;
+
+    while (1) {
+        Py_UCS4 ch = 0;
+        if (e - q > 1) {
+            const unsigned char *e2 = e - 1;
             int kind = PyUnicode_KIND(unicode);
-            void *data = PyUnicode_DATA(unicode);
-            while (_q < aligned_end) {
-                unsigned long block = * (unsigned long *) _q;
-                Py_UCS4 maxch;
-                if (native_ordering) {
-                    /* Can use buffer directly */
-                    if (block & FAST_CHAR_MASK)
-                        break;
-                }
-                else {
-                    /* Need to byte-swap */
-                    if (block & SWAPPED_FAST_CHAR_MASK)
-                        break;
-                    block = ((block >> 8) & STRIPPED_MASK) |
-                            ((block & STRIPPED_MASK) << 8);
-                }
-                maxch = (Py_UCS2)(block & 0xFFFF);
-#if SIZEOF_LONG == 8
-                ch = (Py_UCS2)((block >> 16) & 0xFFFF);
-                maxch = MAX_MAXCHAR(maxch, ch);
-                ch = (Py_UCS2)((block >> 32) & 0xFFFF);
-                maxch = MAX_MAXCHAR(maxch, ch);
-                ch = (Py_UCS2)(block >> 48);
-                maxch = MAX_MAXCHAR(maxch, ch);
-#else
-                ch = (Py_UCS2)(block >> 16);
-                maxch = MAX_MAXCHAR(maxch, ch);
-#endif
-                if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
-                    if (unicode_widen(&unicode, outpos, maxch) < 0)
-                        goto onError;
-                    kind = PyUnicode_KIND(unicode);
-                    data = PyUnicode_DATA(unicode);
-                }
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
-#if SIZEOF_LONG == 8
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
-#else
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
-#endif
-#else
-#if SIZEOF_LONG == 8
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
-#else
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
-#endif
-                PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
-#endif
-                _q += SIZEOF_LONG;
-            }
-            q = _q;
-            if (q >= e)
-                break;
-        }
-        ch = (q[ihi] << 8) | q[ilo];
-
-        q += 2;
-
-        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+            if (kind == PyUnicode_1BYTE_KIND) {
+                if (PyUnicode_IS_ASCII(unicode))
+                    ch = asciilib_utf16_try_decode(
+                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+                            &q, e2, native_ordering);
+                else
+                    ch = ucs1lib_utf16_try_decode(
+                            PyUnicode_1BYTE_DATA(unicode), &outpos,
+                            &q, e2, native_ordering);
+            } else if (kind == PyUnicode_2BYTE_KIND) {
+                ch = ucs2lib_utf16_try_decode(
+                        PyUnicode_2BYTE_DATA(unicode), &outpos,
+                        &q, e2, native_ordering);
+            } else {
+                assert(kind == PyUnicode_4BYTE_KIND);
+                ch = ucs4lib_utf16_try_decode(
+                        PyUnicode_4BYTE_DATA(unicode), &outpos,
+                        &q, e2, native_ordering);
+            }
+        }
+        switch (ch)
+        {
+        case 0:
+            /* remaining byte at the end? (size should be even) */
+            if (q == e || consumed)
+                goto End;
+            errmsg = "truncated data";
+            startinpos = ((const char *)q) - starts;
+            endinpos = ((const char *)e) - starts;
+            break;
+            /* The remaining input chars are ignored if the callback
+               chooses to skip the input */
+        case 1:
+            errmsg = "unexpected end of data";
+            startinpos = ((const char *)q) - 2 - starts;
+            endinpos = ((const char *)e) - starts;
+            break;
+        case 2:
+            errmsg = "illegal encoding";
+            startinpos = ((const char *)q) - 2 - starts;
+            endinpos = startinpos + 2;
+            break;
+        case 3:
+            errmsg = "illegal UTF-16 surrogate";
+            startinpos = ((const char *)q) - 4 - starts;
+            endinpos = startinpos + 2;
+            break;
+        default:
             if (unicode_putchar(&unicode, &outpos, ch) < 0)
                 goto onError;
             continue;
         }
 
-        /* UTF-16 code pair: */
-        if (q > e) {
-            errmsg = "unexpected end of data";
-            startinpos = (((const char *)q) - 2) - starts;
-            endinpos = ((const char *)e) + 1 - starts;
-            goto utf16Error;
-        }
-        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
-            Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
-            q += 2;
-            if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
-                if (unicode_putchar(&unicode, &outpos,
-                                    Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
-                    goto onError;
-                continue;
-            }
-            else {
-                errmsg = "illegal UTF-16 surrogate";
-                startinpos = (((const char *)q)-4)-starts;
-                endinpos = startinpos+2;
-                goto utf16Error;
-            }
-
-        }
-        errmsg = "illegal encoding";
-        startinpos = (((const char *)q)-2)-starts;
-        endinpos = startinpos+2;
-        /* Fall through to report the error */
-
-      utf16Error:
         if (unicode_decode_call_errorhandler(
                 errors,
                 &errorHandler,
@@ -5698,30 +5611,8 @@
                 &outpos))
             goto onError;
     }
-    /* remaining byte at the end? (size should be even) */
-    if (e == q) {
-        if (!consumed) {
-            errmsg = "truncated data";
-            startinpos = ((const char *)q) - starts;
-            endinpos = ((const char *)e) + 1 - starts;
-            if (unicode_decode_call_errorhandler(
-                    errors,
-                    &errorHandler,
-                    "utf16", errmsg,
-                    &starts,
-                    (const char **)&e,
-                    &startinpos,
-                    &endinpos,
-                    &exc,
-                    (const char **)&q,
-                    &unicode,
-                    &outpos))
-                goto onError;
-            /* The remaining input chars are ignored if the callback
-               chooses to skip the input */
-        }
-    }
-
+
+End:
     if (byteorder)
         *byteorder = bo;
 
@@ -5743,9 +5634,6 @@
     return NULL;
 }
 
-#undef FAST_CHAR_MASK
-#undef SWAPPED_FAST_CHAR_MASK
-
 PyObject *
 _PyUnicode_EncodeUTF16(PyObject *str,
                        const char *errors,

_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

[issue14624] Faster utf-16 decoder

Reply via email to