Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: py3.5
Changeset: r94591:2396fb397495
Date: 2018-05-14 22:34 +0100
http://bitbucket.org/pypy/pypy/changeset/2396fb397495/

Log:    hg merge default

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -168,6 +168,222 @@
     return decode_utf8(space, string, allow_surrogates=True)
 
 # ____________________________________________________________
+# utf-16
+
+def str_decode_utf_16(s, size, errors, final=True,
+                           errorhandler=None):
+    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+                                                         errorhandler, "native",
+                                                         'utf-16-' + BYTEORDER2)
+    return result, length
+
+def str_decode_utf_16_be(s, size, errors, final=True,
+                              errorhandler=None):
+    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+                                                         errorhandler, "big",
+                                                         'utf-16-be')
+    return result, length
+
+def str_decode_utf_16_le(s, size, errors, final=True,
+                              errorhandler=None):
+    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+                                                         errorhandler, "little",
+                                                         'utf-16-le')
+    return result, length
+
+def str_decode_utf_16_helper(s, size, errors, final=True,
+                             errorhandler=None,
+                             byteorder="native",
+                             public_encoding_name='utf16'):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+    bo = 0
+
+    if BYTEORDER == 'little':
+        ihi = 1
+        ilo = 0
+    else:
+        ihi = 0
+        ilo = 1
+
+    #  Check for BOM marks (U+FEFF) in the input and adjust current
+    #  byte order setting accordingly. In native mode, the leading BOM
+    #  mark is skipped, in all other modes, it is copied to the output
+    #  stream as-is (giving a ZWNBSP character).
+    pos = 0
+    if byteorder == 'native':
+        if size >= 2:
+            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
+            if BYTEORDER == 'little':
+                if bom == 0xFEFF:
+                    pos += 2
+                    bo = -1
+                elif bom == 0xFFFE:
+                    pos += 2
+                    bo = 1
+            else:
+                if bom == 0xFEFF:
+                    pos += 2
+                    bo = 1
+                elif bom == 0xFFFE:
+                    pos += 2
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        return u'', 0, bo
+    if bo == -1:
+        # force little endian
+        ihi = 1
+        ilo = 0
+
+    elif bo == 1:
+        # force big endian
+        ihi = 0
+        ilo = 1
+
+    result = UnicodeBuilder(size // 2)
+
+    #XXX I think the errors are not correctly handled here
+    while pos < size:
+        # remaining bytes at the end? (size should be even)
+        if len(s) - pos < 2:
+            if not final:
+                break
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "truncated data",
+                                  s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 2:
+                break
+        ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
+        pos += 2
+        if ch < 0xD800 or ch > 0xDFFF:
+            result.append(unichr(ch))
+            continue
+        # UTF-16 code pair:
+        if len(s) - pos < 2:
+            pos -= 2
+            if not final:
+                break
+            errmsg = "unexpected end of data"
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  errmsg, s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 2:
+                break
+        elif 0xD800 <= ch <= 0xDBFF:
+            ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
+            pos += 2
+            if 0xDC00 <= ch2 <= 0xDFFF:
+                if MAXUNICODE < 65536:
+                    result.append(unichr(ch))
+                    result.append(unichr(ch2))
+                else:
+                    result.append(UNICHR((((ch & 0x3FF)<<10) |
+                                           (ch2 & 0x3FF)) + 0x10000))
+                continue
+            else:
+                r, pos = errorhandler(errors, public_encoding_name,
+                                      "illegal UTF-16 surrogate",
+                                      s, pos - 4, pos - 2)
+                result.append(r)
+        else:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "illegal encoding",
+                                  s, pos - 2, pos)
+            result.append(r)
+    return result.build(), pos, bo
+
+def _STORECHAR(result, CH, byteorder):
+    hi = chr(((CH) >> 8) & 0xff)
+    lo = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(lo)
+        result.append(hi)
+    else:
+        result.append(hi)
+        result.append(lo)
+
+def unicode_encode_utf_16_helper(s, size, errors,
+                                 errorhandler=None,
+                                 allow_surrogates=True,
+                                 byteorder='little',
+                                 public_encoding_name='utf16'):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+    if size == 0:
+        if byteorder == 'native':
+            result = StringBuilder(2)
+            _STORECHAR(result, 0xFEFF, BYTEORDER)
+            return result.build()
+        return ""
+
+    result = StringBuilder(size * 2 + 2)
+    if byteorder == 'native':
+        _STORECHAR(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+
+        if ch < 0xD800:
+            _STORECHAR(result, ch, byteorder)
+        elif ch >= 0x10000:
+            _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder)
+            _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder)
+        elif ch >= 0xE000 or allow_surrogates:
+            _STORECHAR(result, ch, byteorder)
+        else:
+            ru, rs, pos = errorhandler(errors, public_encoding_name,
+                                       'surrogates not allowed',
+                                       s, pos-1, pos)
+            if rs is not None:
+                # py3k only
+                if len(rs) % 2 != 0:
+                    errorhandler('strict', public_encoding_name,
+                                 'surrogates not allowed',
+                                 s, pos-1, pos)
+                result.append(rs)
+                continue
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR(result, ord(ch), byteorder)
+                else:
+                    errorhandler('strict', public_encoding_name,
+                                 'surrogates not allowed',
+                                 s, pos-1, pos)
+            continue
+
+    return result.build()
+
+def unicode_encode_utf_16(s, size, errors,
+                          errorhandler=None,
+                          allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "native",
+                                        'utf-16-' + BYTEORDER2)
+
+def unicode_encode_utf_16_be(s, size, errors,
+                             errorhandler=None,
+                             allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "big",
+                                        'utf-16-be')
+
+def unicode_encode_utf_16_le(s, size, errors,
+                             errorhandler=None,
+                             allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "little",
+                                        'utf-16-le')
+
+
+# ____________________________________________________________
 # utf-32
 
 def str_decode_utf_32(s, size, errors, final=True,
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to