Author: Amaury Forgeot d'Arc <[email protected]>
Branch: py3k
Changeset: r57535:6367f38e2321
Date: 2012-09-24 23:44 +0200
http://bitbucket.org/pypy/pypy/changeset/6367f38e2321/
Log: hg merge default
diff --git a/pypy/module/_codecs/interp_codecs.py
b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -455,7 +455,6 @@
"ascii_encode",
"latin_1_encode",
"utf_7_encode",
- "utf_8_encode",
"utf_16_encode",
"utf_16_be_encode",
"utf_16_le_encode",
@@ -472,7 +471,6 @@
"ascii_decode",
"latin_1_decode",
"utf_7_decode",
- "utf_8_decode",
"utf_16_decode",
"utf_16_be_decode",
"utf_16_le_decode",
@@ -487,6 +485,30 @@
make_encoder_wrapper('mbcs_encode')
make_decoder_wrapper('mbcs_decode')
+# utf-8 functions are not regular, because we have to pass
+# "allow_surrogates=True"
+@unwrap_spec(uni=unicode, errors='str_or_None')
+def utf_8_encode(space, uni, errors="strict"):
+ if errors is None:
+ errors = 'strict'
+ state = space.fromcache(CodecState)
+ result = runicode.unicode_encode_utf_8(
+ uni, len(uni), errors, state.encode_error_handler,
+ allow_surrogates=True)
+ return space.newtuple([space.wrap(result), space.wrap(len(uni))])
+
+@unwrap_spec(string='bufferstr', errors='str_or_None')
+def utf_8_decode(space, string, errors="strict", w_final=False):
+ if errors is None:
+ errors = 'strict'
+ final = space.is_true(w_final)
+ state = space.fromcache(CodecState)
+ result, consumed = runicode.str_decode_utf_8(
+ string, len(string), errors,
+ final, state.decode_error_handler,
+ allow_surrogates=True)
+ return space.newtuple([space.wrap(result), space.wrap(consumed)])
+
@unwrap_spec(data="bufferstr", errors='str_or_None', byteorder=int)
def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
if errors is None:
diff --git a/pypy/objspace/std/unicodeobject.py
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -12,7 +12,7 @@
from pypy.rlib.objectmodel import compute_hash, specialize
from pypy.rlib.objectmodel import compute_unique_id
from pypy.rlib.rstring import UnicodeBuilder
-from pypy.rlib.runicode import unicode_escape_nonprintable
+from pypy.rlib.runicode import make_unicode_escape_function
from pypy.module.unicodedata import unicodedb
from pypy.tool.sourcetools import func_with_new_name
from pypy.rlib import jit
@@ -854,10 +854,13 @@
space.wrap("character mapping must return integer, None or
unicode"))
return W_UnicodeObject(u''.join(result))
+_repr_function, _ = make_unicode_escape_function(
+ pass_printable=True, unicode_output=True, quotes=True, prefix='')
+
def repr__Unicode(space, w_unicode):
chars = w_unicode._value
size = len(chars)
- s = unicode_escape_nonprintable(chars, size, "strict", quotes=True)
+ s = _repr_function(chars, size, "strict")
return space.wrap(s)
def mod__Unicode_ANY(space, w_format, w_values):
diff --git a/pypy/objspace/std/unicodetype.py b/pypy/objspace/std/unicodetype.py
--- a/pypy/objspace/std/unicodetype.py
+++ b/pypy/objspace/std/unicodetype.py
@@ -258,15 +258,13 @@
# XXX error handling
s = space.bufferstr_w(w_obj)
eh = decode_error_handler(space)
- return space.wrap(str_decode_ascii(s, len(s), None,
- final=True,
- errorhandler=eh)[0])
+ return space.wrap(str_decode_ascii(
+ s, len(s), None, final=True, errorhandler=eh)[0])
if encoding == 'utf-8':
s = space.bufferstr_w(w_obj)
eh = decode_error_handler(space)
- return space.wrap(str_decode_utf_8(s, len(s), None,
- final=True,
- errorhandler=eh)[0])
+ return space.wrap(str_decode_utf_8(
+ s, len(s), None, final=True, errorhandler=eh)[0])
w_codecs = space.getbuiltinmodule("_codecs")
w_decode = space.getattr(w_codecs, space.wrap("decode"))
if errors is None:
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -77,12 +77,14 @@
]
def str_decode_utf_8(s, size, errors, final=False,
- errorhandler=None):
+ errorhandler=None, allow_surrogates=False):
if errorhandler is None:
errorhandler = raise_unicode_exception_decode
- return str_decode_utf_8_impl(s, size, errors, final, errorhandler)
+ return str_decode_utf_8_impl(s, size, errors, final, errorhandler,
+ allow_surrogates=allow_surrogates)
-def str_decode_utf_8_impl(s, size, errors, final, errorhandler):
+def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
+ allow_surrogates):
if size == 0:
return u'', 0
@@ -184,7 +186,7 @@
if (ordch2>>6 != 0x2 or # 0b10
(ordch1 == 0xe0 and ordch2 < 0xa0)
# surrogates shouldn't be valid UTF-8!
- or (ordch1 == 0xed and ordch2 > 0x9f)
+ or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
):
r, pos = errorhandler(errors, 'utf-8',
'invalid continuation byte',
@@ -253,12 +255,15 @@
result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
result.append((chr((0x80 | (ch & 0x3f)))))
-def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+def unicode_encode_utf_8(s, size, errors, errorhandler=None,
+ allow_surrogates=False):
if errorhandler is None:
errorhandler = raise_unicode_exception_encode
- return unicode_encode_utf_8_impl(s, size, errors, errorhandler)
+ return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
+ allow_surrogates=allow_surrogates)
-def unicode_encode_utf_8_impl(s, size, errors, errorhandler):
+def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
+ allow_surrogates=False):
assert(size >= 0)
result = StringBuilder(size)
pos = 0
@@ -286,11 +291,19 @@
pos += 1
_encodeUCS4(result, ch3)
continue
- r, pos = errorhandler(errors, 'utf-8',
- 'surrogates not allowed',
- s, pos-1, pos)
- result.append(r)
- continue
+ if not allow_surrogates:
+ r, pos = errorhandler(errors, 'utf-8',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ for ch in r:
+ if ord(ch) < 0x80:
+ result.append(chr(ord(ch)))
+ else:
+ errorhandler('strict', 'utf-8',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ continue
+ # else: fall through and handle isolated high surrogates
result.append((chr((0xe0 | (ch >> 12)))))
result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
result.append((chr((0x80 | (ch & 0x3f)))))
@@ -1210,13 +1223,15 @@
return builder.build(), pos
-def make_unicode_escape_function(for_repr=False):
+def make_unicode_escape_function(pass_printable=False, unicode_output=False,
+ quotes=False, prefix=None):
# Python3 has two similar escape functions: One to implement
# encode('unicode_escape') and which outputs bytes, and unicode.__repr__
# which outputs unicode. They cannot share RPython code, so we generate
# them with the template below.
+ # Python2 does not really need this, but it reduces diffs between branches.
- if for_repr:
+ if unicode_output:
STRING_BUILDER = UnicodeBuilder
STR = unicode
CHR = UNICHR
@@ -1225,11 +1240,13 @@
STR = str
CHR = chr
- def unicode_escape(s, size, errors, errorhandler=None, quotes=False):
+ def unicode_escape(s, size, errors, errorhandler=None):
# errorhandler is not used: this function cannot cause Unicode errors
result = STRING_BUILDER(size)
if quotes:
+ if prefix:
+ result.append(STR(prefix))
if s.find(u'\'') != -1 and s.find(u'\"') == -1:
quote = ord('\"')
result.append(STR('"'))
@@ -1282,9 +1299,9 @@
result.append(STR('\\\\'))
# Map non-printable or non-ascii to '\xhh' or '\uhhhh'
- elif for_repr and not unicodedb.isprintable(oc):
+ elif pass_printable and not unicodedb.isprintable(oc):
char_escape_helper(result, oc)
- elif not for_repr and (oc < 32 or oc >= 0x7F):
+ elif not pass_printable and (oc < 32 or oc >= 0x7F):
char_escape_helper(result, oc)
# Copy everything else as-is
@@ -1317,6 +1334,7 @@
return unicode_escape, char_escape_helper
+# This function is also used by _codecs/interp_codecs.py
(unicode_encode_unicode_escape, raw_unicode_escape_helper
) = make_unicode_escape_function()
(unicode_escape_nonprintable, _
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit