Serhiy Storchaka <storch...@gmail.com> added the comment:
Here is an updated patch, taking into account that unicode_widen is already
optimized.
----------
Added file: http://bugs.python.org/file25443/decode_utf16_2.patch
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue14624>
_______________________________________
diff -r 0a9143d7b097 Objects/stringlib/asciilib.h
--- a/Objects/stringlib/asciilib.h Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/asciilib.h Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
#define STRINGLIB(F) asciilib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
+#define STRINGLIB_MAX_CHAR 0x7Fu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
diff -r 0a9143d7b097 Objects/stringlib/codecs.h
--- a/Objects/stringlib/codecs.h Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/codecs.h Thu May 03 15:50:11 2012 +0300
@@ -150,7 +150,6 @@
return ret;
}
-#undef LONG_PTR_MASK
#undef ASCII_CHAR_MASK
@@ -350,4 +349,153 @@
#undef MAX_SHORT_UNICHARS
}
+#define UCS2_REPEAT_MASK (~0ul / 0xFFFFul)
+
+/* The mask for fast checking of whether a C 'long' may contain
+ UTF16-encoded surrogate characters. This is an efficient heuristic,
+ assuming that non-surrogate characters with a code point >= 0x8000 are
+ rare in most input.
+*/
+#if STRINGLIB_SIZEOF_CHAR == 1
+# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
+#else
+# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u)
+#endif
+/* The mask for fast byteswapping. */
+#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu)
+/* Swap bytes. */
+#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \
+ (((value) & STRIPPED_MASK) << 8))
+
+Py_LOCAL_INLINE(Py_UCS4)
+STRINGLIB(utf16_try_decode)(STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
+ const unsigned char **inptr,
+ const unsigned char *e,
+ int native_ordering)
+{
+ const unsigned char *aligned_end =
+ (const unsigned char *) ((size_t) (e + 1) & ~LONG_PTR_MASK);
+ const unsigned char *q = *inptr;
+ STRINGLIB_CHAR *p = dest + *outpos;
+ /* Offsets from q for retrieving byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ int ihi = !!native_ordering, ilo = !native_ordering;
+#else
+ int ihi = !native_ordering, ilo = !!native_ordering;
+#endif
+
+ while (q < e) {
+ Py_UCS4 ch;
+ /* First check for possible aligned read of a C 'long'. Unaligned
+ reads are more expensive, better to defer to another iteration. */
+ if (!((size_t) q & LONG_PTR_MASK)) {
+ /* Fast path for runs of non-surrogate chars. */
+ register const unsigned char *_q = q;
+ while (_q < aligned_end) {
+ unsigned long block = * (unsigned long *) _q;
+ /* Fast checking of whether a C 'long' may contain
+ UTF16-encoded surrogate characters. This is an efficient
+ heuristic, assuming that non-surrogate characters with
+ a code point >= 0x8000 are rare in most input.
+ */
+ if (native_ordering) {
+ /* Can use buffer directly */
+ if (block & FAST_CHAR_MASK)
+ break;
+ }
+ else {
+ /* Need to byte-swap */
+ if (block & SWAB(FAST_CHAR_MASK))
+ break;
+#if STRINGLIB_SIZEOF_CHAR == 1
+ block >>= 8;
+#else
+ block = SWAB(block);
+#endif
+ }
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if SIZEOF_LONG == 4
+ *(p + 0) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+ *(p + 1) = (STRINGLIB_CHAR)(block >> 16);
+#endif
+#if SIZEOF_LONG == 8
+ *(p + 0) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+ *(p + 1) = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
+ *(p + 2) = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
+ *(p + 3) = (STRINGLIB_CHAR)(block >> 48);
+#endif
+#else
+#if SIZEOF_LONG == 4
+ *(p + 0) = (STRINGLIB_CHAR)(block >> 16);
+ *(p + 1) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+#endif
+#if SIZEOF_LONG == 8
+ *(p + 0) = (STRINGLIB_CHAR)(block >> 48);
+ *(p + 1) = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
+ *(p + 2) = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
+ *(p + 3) = (STRINGLIB_CHAR)(block & 0xFFFFu);
+#endif
+#endif
+ _q += SIZEOF_LONG;
+ p += SIZEOF_LONG / 2;
+ }
+ q = _q;
+ if (q >= e)
+ break;
+ }
+ ch = (q[ihi] << 8) | q[ilo];
+ q += 2;
+#if STRINGLIB_SIZEOF_CHAR == 1
+ if (ch <= STRINGLIB_MAX_CHAR) {
+ *p++ = (STRINGLIB_CHAR)ch;
+ continue;
+ }
+#endif
+ if (!Py_UNICODE_IS_SURROGATE(ch)) {
+#if STRINGLIB_SIZEOF_CHAR >= 2
+ *p++ = (STRINGLIB_CHAR)ch;
+ continue;
+#else
+ *inptr = q;
+ *outpos = p - dest;
+ return ch;
+#endif
+ }
+ /* UTF-16 code pair: */
+ if (q < e) {
+ if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
+ Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
+ q += 2;
+ if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
+ ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
+#if STRINGLIB_SIZEOF_CHAR == 4
+ *p++ = (STRINGLIB_CHAR)ch;
+ continue;
+#else
+ *inptr = q;
+ *outpos = p - dest;
+ return ch;
+#endif
+ }
+ *inptr = q;
+ *outpos = p - dest;
+ return 3; /* illegal UTF-16 surrogate */
+ }
+ *inptr = q;
+ *outpos = p - dest;
+ return 2; /* illegal encoding */
+ }
+ *inptr = q;
+ *outpos = p - dest;
+ return 1; /* unexpected end of data */
+ }
+ *inptr = q;
+ *outpos = p - dest;
+ return 0;
+}
+#undef UCS2_REPEAT_MASK
+#undef FAST_CHAR_MASK
+#undef STRIPPED_MASK
+#undef SWAB
+#undef LONG_PTR_MASK
#endif /* STRINGLIB_IS_UNICODE */
diff -r 0a9143d7b097 Objects/stringlib/ucs1lib.h
--- a/Objects/stringlib/ucs1lib.h Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs1lib.h Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs1lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 1
+#define STRINGLIB_MAX_CHAR 0xFFu
#define STRINGLIB_CHAR Py_UCS1
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
diff -r 0a9143d7b097 Objects/stringlib/ucs2lib.h
--- a/Objects/stringlib/ucs2lib.h Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs2lib.h Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs2lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 2
+#define STRINGLIB_MAX_CHAR 0xFFFFu
#define STRINGLIB_CHAR Py_UCS2
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
diff -r 0a9143d7b097 Objects/stringlib/ucs4lib.h
--- a/Objects/stringlib/ucs4lib.h Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/ucs4lib.h Thu May 03 15:50:11 2012 +0300
@@ -7,6 +7,7 @@
#define STRINGLIB(F) ucs4lib_##F
#define STRINGLIB_OBJECT PyUnicodeObject
#define STRINGLIB_SIZEOF_CHAR 4
+#define STRINGLIB_MAX_CHAR 0x10FFFFu
#define STRINGLIB_CHAR Py_UCS4
#define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U"
diff -r 0a9143d7b097 Objects/stringlib/undef.h
--- a/Objects/stringlib/undef.h Thu May 03 13:43:07 2012 +0200
+++ b/Objects/stringlib/undef.h Thu May 03 15:50:11 2012 +0300
@@ -1,6 +1,7 @@
#undef FASTSEARCH
#undef STRINGLIB
#undef STRINGLIB_SIZEOF_CHAR
+#undef STRINGLIB_MAX_CHAR
#undef STRINGLIB_CHAR
#undef STRINGLIB_STR
#undef STRINGLIB_LEN
diff -r 0a9143d7b097 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c Thu May 03 13:43:07 2012 +0200
+++ b/Objects/unicodeobject.c Thu May 03 15:50:11 2012 +0300
@@ -4644,6 +4644,10 @@
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
+#include "stringlib/asciilib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
@@ -5472,25 +5476,6 @@
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}
-/* Two masks for fast checking of whether a C 'long' may contain
- UTF16-encoded surrogate characters. This is an efficient heuristic,
- assuming that non-surrogate characters with a code point >= 0x8000 are
- rare in most input.
- FAST_CHAR_MASK is used when the input is in native byte ordering,
- SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
-*/
-#if (SIZEOF_LONG == 8)
-# define FAST_CHAR_MASK 0x8000800080008000L
-# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
-# define STRIPPED_MASK 0x00FF00FF00FF00FFL
-#elif (SIZEOF_LONG == 4)
-# define FAST_CHAR_MASK 0x80008000L
-# define SWAPPED_FAST_CHAR_MASK 0x00800080L
-# define STRIPPED_MASK 0x00FF00FFL
-#else
-# error C 'long' size should be either 4 or 8!
-#endif
-
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t size,
@@ -5503,30 +5488,22 @@
Py_ssize_t endinpos;
Py_ssize_t outpos;
PyObject *unicode;
- const unsigned char *q, *e, *aligned_end;
+ const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
- int native_ordering = 0;
+ int native_ordering;
const char *errmsg = "";
- /* Offsets from q for retrieving byte pairs in the right order. */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- int ihi = 1, ilo = 0;
-#else
- int ihi = 0, ilo = 1;
-#endif
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- /* Note: size will always be longer than the resulting Unicode
- character count */
- unicode = PyUnicode_New(size, 127);
- if (!unicode)
- return NULL;
- if (size == 0)
- return unicode;
- outpos = 0;
+ if (size == 0) {
+ if (consumed)
+ *consumed = 0;
+ Py_INCREF(unicode_empty);
+ return unicode_empty;
+ }
q = (unsigned char *)s;
- e = q + size - 1;
+ e = q + size;
if (byteorder)
bo = *byteorder;
@@ -5537,8 +5514,7 @@
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
if (size >= 2) {
- const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ const Py_UCS4 bom = (q[1] << 8) | q[0];
if (bom == 0xFEFF) {
q += 2;
bo = -1;
@@ -5547,143 +5523,80 @@
q += 2;
bo = 1;
}
+ }
+ }
+
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ native_ordering = bo <= 0;
#else
- if (bom == 0xFEFF) {
- q += 2;
- bo = 1;
- }
- else if (bom == 0xFFFE) {
- q += 2;
- bo = -1;
- }
-#endif
- }
- }
-
- if (bo == -1) {
- /* force LE */
- ihi = 1;
- ilo = 0;
- }
- else if (bo == 1) {
- /* force BE */
- ihi = 0;
- ilo = 1;
- }
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- native_ordering = ilo < ihi;
-#else
- native_ordering = ilo > ihi;
-#endif
-
- aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
- while (q < e) {
- Py_UCS4 ch;
- /* First check for possible aligned read of a C 'long'. Unaligned
- reads are more expensive, better to defer to another iteration. */
- if (!((size_t) q & LONG_PTR_MASK)) {
- /* Fast path for runs of non-surrogate chars. */
- register const unsigned char *_q = q;
+ native_ordering = bo >= 0;
+#endif
+
+ /* Note: size will always be longer than the resulting Unicode
+ character count */
+ unicode = PyUnicode_New(size, 127);
+ if (!unicode)
+ return NULL;
+ outpos = 0;
+
+ while (1) {
+ Py_UCS4 ch = 0;
+ if (e - q > 1) {
+ const unsigned char *e2 = e - 1;
int kind = PyUnicode_KIND(unicode);
- void *data = PyUnicode_DATA(unicode);
- while (_q < aligned_end) {
- unsigned long block = * (unsigned long *) _q;
- Py_UCS4 maxch;
- if (native_ordering) {
- /* Can use buffer directly */
- if (block & FAST_CHAR_MASK)
- break;
- }
- else {
- /* Need to byte-swap */
- if (block & SWAPPED_FAST_CHAR_MASK)
- break;
- block = ((block >> 8) & STRIPPED_MASK) |
- ((block & STRIPPED_MASK) << 8);
- }
- maxch = (Py_UCS2)(block & 0xFFFF);
-#if SIZEOF_LONG == 8
- ch = (Py_UCS2)((block >> 16) & 0xFFFF);
- maxch = MAX_MAXCHAR(maxch, ch);
- ch = (Py_UCS2)((block >> 32) & 0xFFFF);
- maxch = MAX_MAXCHAR(maxch, ch);
- ch = (Py_UCS2)(block >> 48);
- maxch = MAX_MAXCHAR(maxch, ch);
-#else
- ch = (Py_UCS2)(block >> 16);
- maxch = MAX_MAXCHAR(maxch, ch);
-#endif
- if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
- if (unicode_widen(&unicode, outpos, maxch) < 0)
- goto onError;
- kind = PyUnicode_KIND(unicode);
- data = PyUnicode_DATA(unicode);
- }
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
-#if SIZEOF_LONG == 8
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
-#else
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
-#endif
-#else
-#if SIZEOF_LONG == 8
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
-#else
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
-#endif
- PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
-#endif
- _q += SIZEOF_LONG;
- }
- q = _q;
- if (q >= e)
- break;
- }
- ch = (q[ihi] << 8) | q[ilo];
-
- q += 2;
-
- if (!Py_UNICODE_IS_SURROGATE(ch)) {
+ if (kind == PyUnicode_1BYTE_KIND) {
+ if (PyUnicode_IS_ASCII(unicode))
+ ch = asciilib_utf16_try_decode(
+ PyUnicode_1BYTE_DATA(unicode), &outpos,
+ &q, e2, native_ordering);
+ else
+ ch = ucs1lib_utf16_try_decode(
+ PyUnicode_1BYTE_DATA(unicode), &outpos,
+ &q, e2, native_ordering);
+ } else if (kind == PyUnicode_2BYTE_KIND) {
+ ch = ucs2lib_utf16_try_decode(
+ PyUnicode_2BYTE_DATA(unicode), &outpos,
+ &q, e2, native_ordering);
+ } else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ ch = ucs4lib_utf16_try_decode(
+ PyUnicode_4BYTE_DATA(unicode), &outpos,
+ &q, e2, native_ordering);
+ }
+ }
+ switch (ch)
+ {
+ case 0:
+ /* remaining byte at the end? (size should be even) */
+ if (q == e || consumed)
+ goto End;
+ errmsg = "truncated data";
+ startinpos = ((const char *)q) - starts;
+ endinpos = ((const char *)e) - starts;
+ break;
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
+ case 1:
+ errmsg = "unexpected end of data";
+ startinpos = ((const char *)q) - 2 - starts;
+ endinpos = ((const char *)e) - starts;
+ break;
+ case 2:
+ errmsg = "illegal encoding";
+ startinpos = ((const char *)q) - 2 - starts;
+ endinpos = startinpos + 2;
+ break;
+ case 3:
+ errmsg = "illegal UTF-16 surrogate";
+ startinpos = ((const char *)q) - 4 - starts;
+ endinpos = startinpos + 2;
+ break;
+ default:
if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
continue;
}
- /* UTF-16 code pair: */
- if (q > e) {
- errmsg = "unexpected end of data";
- startinpos = (((const char *)q) - 2) - starts;
- endinpos = ((const char *)e) + 1 - starts;
- goto utf16Error;
- }
- if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
- Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
- q += 2;
- if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
- if (unicode_putchar(&unicode, &outpos,
- Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
- goto onError;
- continue;
- }
- else {
- errmsg = "illegal UTF-16 surrogate";
- startinpos = (((const char *)q)-4)-starts;
- endinpos = startinpos+2;
- goto utf16Error;
- }
-
- }
- errmsg = "illegal encoding";
- startinpos = (((const char *)q)-2)-starts;
- endinpos = startinpos+2;
- /* Fall through to report the error */
-
- utf16Error:
if (unicode_decode_call_errorhandler(
errors,
&errorHandler,
@@ -5698,30 +5611,8 @@
&outpos))
goto onError;
}
- /* remaining byte at the end? (size should be even) */
- if (e == q) {
- if (!consumed) {
- errmsg = "truncated data";
- startinpos = ((const char *)q) - starts;
- endinpos = ((const char *)e) + 1 - starts;
- if (unicode_decode_call_errorhandler(
- errors,
- &errorHandler,
- "utf16", errmsg,
- &starts,
- (const char **)&e,
- &startinpos,
- &endinpos,
- &exc,
- (const char **)&q,
- &unicode,
- &outpos))
- goto onError;
- /* The remaining input chars are ignored if the callback
- chooses to skip the input */
- }
- }
-
+
+End:
if (byteorder)
*byteorder = bo;
@@ -5743,9 +5634,6 @@
return NULL;
}
-#undef FAST_CHAR_MASK
-#undef SWAPPED_FAST_CHAR_MASK
-
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
const char *errors,
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com