Author: Armin Rigo <[email protected]>
Branch: unicode-utf8-re
Changeset: r93347:58b6fedc39bc
Date: 2017-12-10 08:27 +0100
http://bitbucket.org/pypy/pypy/changeset/58b6fedc39bc/
Log: hg merge unicode-utf8
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -12,3 +12,4 @@
* improve performance of splitlines
* fix _pypyjson to not use a wrapped dict when decoding an object
+* make sure we review all the places that call ord(unichr) to check for ValueErrors
\ No newline at end of file
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,4 @@
+import pytest
from hypothesis import given, strategies
from rpython.rlib import rutf8
@@ -5,6 +6,7 @@
from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
def decode_utf8(u):
return str_decode_utf8(u, True, "strict", None)
@@ -68,3 +70,16 @@
def test_unicode_escape(u):
r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+ assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+ with pytest.raises(ValueError):
+ uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+ state = space.fromcache(CodecState)
+ handler = state.encode_error_handler
+ assert uh.unicode_encode_decimal(
+ u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+ result = uh.unicode_encode_decimal(
+ u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+ assert result == '12&#4660;'
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,11 +1,13 @@
import sys
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.objectmodel import specialize
from rpython.rlib import rutf8
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rlib.rstring import StringBuilder
+from rpython.rtyper.lltypesystem import rffi
from pypy.module._codecs import interp_codecs
+from pypy.module.unicodedata import unicodedb
@specialize.memo()
def decode_error_handler(space):
@@ -34,6 +36,16 @@
space.newtext(msg)]))
return raise_unicode_exception_encode
+def default_error_encode(
+ errors, encoding, msg, u, startingpos, endingpos):
+ """A default handler, for tests"""
+ assert endingpos >= 0
+ if errors == 'replace':
+ return '?', endingpos
+ if errors == 'ignore':
+ return '', endingpos
+ raise ValueError
+
def convert_arg_to_w_unicode(space, w_arg, strict=None):
return space.convert_arg_to_w_unicode(w_arg)
@@ -204,7 +216,7 @@
if c > 0x7F:
errorhandler("strict", 'ascii',
'ordinal not in range(128)', utf8,
- pos, pos + 1)
+ pos, pos + 1)
j = rutf8.next_codepoint_pos(r, j)
pos = newpos
res.append(r)
@@ -530,6 +542,19 @@
return builder.build(), pos, outsize
+def wcharpsize2utf8(space, wcharp, size):
+ """Safe version of rffi.wcharpsize2utf8.
+
+ Raises app-level ValueError if any wchar value is outside the valid
+ codepoint range.
+ """
+ try:
+ return rffi.wcharpsize2utf8(wcharp, size)
+ except ValueError:
+ raise oefmt(space.w_ValueError,
+ "character is not in range [U+0000; U+10ffff]")
+
+
# ____________________________________________________________
# Raw unicode escape
@@ -575,8 +600,8 @@
digits = 4 if s[pos] == 'u' else 8
message = "truncated \\uXXXX"
pos += 1
- pos, _, _ = hexescape(result, s, pos, digits,
- "rawunicodeescape", errorhandler, message, errors)
+ pos, _ = hexescape(result, s, pos, digits,
+ "rawunicodeescape", errorhandler, message, errors)
r = result.build()
lgt = rutf8.check_utf8(r, True)
@@ -1073,22 +1098,19 @@
elif ch >= 0xE000 or allow_surrogates:
_STORECHAR(result, ch, byteorder)
else:
- ru, newindex = errorhandler(errors, public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- for j in range(newindex - index):
- pos = rutf8.next_codepoint_pos(s, pos)
- j = 0
- while j < len(ru):
- ch = rutf8.codepoint_at_pos(ru, j)
- if ord(ch) < 0xD800:
- _STORECHAR(result, ord(ch), byteorder)
+ res_8, newindex = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ for cp in rutf8.Utf8StringIterator(res_8):
+ if cp < 0xD800:
+ _STORECHAR(result, cp, byteorder)
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- j = rutf8.next_codepoint_pos(ru, j)
- index = newindex
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
pos = rutf8.next_codepoint_pos(s, pos)
@@ -1257,22 +1279,19 @@
ch = rutf8.codepoint_at_pos(s, pos)
pos = rutf8.next_codepoint_pos(s, pos)
if not allow_surrogates and 0xD800 <= ch < 0xE000:
- ru, newindex = errorhandler(errors, public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- for j in range(newindex - index):
- pos = rutf8.next_codepoint_pos(s, pos)
- j = 0
- while j < len(ru):
- ch = rutf8.codepoint_at_pos(ru, j)
- if ord(ch) < 0xD800:
- _STORECHAR32(result, ord(ch), byteorder)
+ res_8, newindex = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ for ch in rutf8.Utf8StringIterator(res_8):
+ if ch < 0xD800:
+ _STORECHAR32(result, ch, byteorder)
else:
- errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- j = rutf8.next_codepoint_pos(ru, j)
- index = newindex
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
_STORECHAR32(result, ch, byteorder)
index += 1
@@ -1400,8 +1419,7 @@
lgt = rutf8.check_utf8(r, True)
return r, pos, lgt
-def utf8_encode_charmap(s, errors, errorhandler=None,
- mapping=None):
+def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
size = len(s)
if mapping is None:
return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
@@ -1413,34 +1431,99 @@
index = 0
while pos < size:
ch = rutf8.codepoint_at_pos(s, pos)
-
c = mapping.get(ch, '')
if len(c) == 0:
- # collect all unencodable chars. Important for narrow builds.
- collend = rutf8.next_codepoint_pos(s, pos)
- endindex = index + 1
- while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '':
- collend = rutf8.next_codepoint_pos(s, collend)
- endindex += 1
- rs, endindex = errorhandler(errors, "charmap",
+ # collect all unencodable chars.
+ startindex = index
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ while (pos < size and
+ mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ res_8, newindex = errorhandler(errors, "charmap",
"character maps to <undefined>",
- s, index, endindex)
- j = 0
- for _ in range(endindex - index):
- ch2 = rutf8.codepoint_at_pos(rs, j)
- ch2 = mapping.get(ch2, '')
+ s, startindex, index)
+ for cp2 in rutf8.Utf8StringIterator(res_8):
+ ch2 = mapping.get(cp2, '')
if not ch2:
errorhandler(
- "strict", "charmap",
- "character maps to <undefined>",
- s, index, index + 1)
+ "strict", "charmap", "character maps to <undefined>",
+ s, startindex, index)
result.append(ch2)
- index += 1
- j = rutf8.next_codepoint_pos(rs, j)
- pos = rutf8.next_codepoint_pos(s, pos)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
result.append(c)
index += 1
pos = rutf8.next_codepoint_pos(s, pos)
return result.build()
+# ____________________________________________________________
+# Decimal Encoder
+def unicode_encode_decimal(s, errors, errorhandler=None):
+ """Converts whitespace to ' ', decimal characters to their
+ corresponding ASCII digit and all other Latin-1 characters except
+ \0 as-is. Characters outside this range (Unicode ordinals 1-255)
+ are treated as errors. This includes embedded NULL bytes.
+ """
+ if errorhandler is None:
+ errorhandler = default_error_encode
+ result = StringBuilder(len(s))
+ pos = 0
+ i = 0
+ it = rutf8.Utf8StringIterator(s)
+ for ch in it:
+ if unicodedb.isspace(ch):
+ result.append(' ')
+ i += 1
+ continue
+ try:
+ decimal = unicodedb.decimal(ch)
+ except KeyError:
+ pass
+ else:
+ result.append(chr(48 + decimal))
+ i += 1
+ continue
+ if 0 < ch < 256:
+ result.append(chr(ch))
+ i += 1
+ continue
+ # All other characters are considered unencodable
+ start_index = i
+ i += 1
+ while not it.done():
+ ch = rutf8.codepoint_at_pos(s, it.get_pos())
+ try:
+ if (0 < ch < 256 or unicodedb.isspace(ch) or
+ unicodedb.decimal(ch) >= 0):
+ break
+ except KeyError:
+ # not a decimal
+ pass
+ if it.done():
+ break
+ ch = next(it)
+ i += 1
+ end_index = i
+ msg = "invalid decimal Unicode string"
+ r, pos = errorhandler(
+ errors, 'decimal', msg, s, start_index, end_index)
+ for ch in rutf8.Utf8StringIterator(r):
+ if unicodedb.isspace(ch):
+ result.append(' ')
+ continue
+ try:
+ decimal = unicodedb.decimal(ch)
+ except KeyError:
+ pass
+ else:
+ result.append(chr(48 + decimal))
+ continue
+ if 0 < ch < 256:
+ result.append(chr(ch))
+ continue
+ errorhandler('strict', 'decimal', msg, s, start_index, end_index)
+ return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -70,9 +70,6 @@
raise oefmt(space.w_IndexError,
"position %d from error handler out of bounds",
newpos)
- if newpos < startpos:
- raise oefmt(space.w_IndexError,
- "position %d from error handler did not progress", newpos)
w_replace = space.convert_to_w_unicode(w_replace)
return w_replace._utf8, newpos
return call_errorhandler
@@ -226,7 +223,7 @@
w_end = space.getattr(w_exc, space.newtext('end'))
end = space.int_w(w_end)
start = w_obj._index_to_byte(start)
- end = w_obj._index_to_byte(end)
+ end = w_obj._index_to_byte(end)
builder = StringBuilder()
pos = start
obj = w_obj._utf8
@@ -460,22 +457,12 @@
# utf-8 functions are not regular, because we have to pass
# "allow_surrogates=True"
-@unwrap_spec(utf8='utf8', errors='text_or_none')
-def utf_8_encode(space, utf8, errors="strict"):
- length, _ = rutf8.check_utf8(utf8, allow_surrogates=True)
- return space.newtuple([space.newbytes(utf8), space.newint(length)])
-#@unwrap_spec(uni=unicode, errors='text_or_none')
-#def utf_8_encode(space, uni, errors="strict"):
-# if errors is None:
-# errors = 'strict'
-# state = space.fromcache(CodecState)
-# # NB. can't call unicode_encode_utf_8() directly because that's
-# # an @elidable function nowadays. Instead, we need the _impl().
-# # (The problem is the errorhandler, which calls arbitrary Python.)
-# result = runicode.unicode_encode_utf_8_impl(
-# uni, len(uni), errors, state.encode_error_handler,
-# allow_surrogates=True)
-# return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+@unwrap_spec(errors='text_or_none')
+def utf_8_encode(space, w_obj, errors="strict"):
+ utf8, lgt = space.utf8_len_w(w_obj)
+ if rutf8.has_surrogates(utf8):
+ utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+ return space.newtuple([space.newbytes(utf8), space.newint(lgt)])
@unwrap_spec(string='bufferstr', errors='text_or_none',
w_final = WrappedDefault(False))
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -537,8 +537,12 @@
assert '\xff'.decode('utf-7', 'ignore') == ''
assert '\x00'.decode('unicode-internal', 'ignore') == ''
- def test_backslahreplace(self):
- assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+ def test_backslashreplace(self):
+ sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+ expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+ assert sin.encode('ascii', 'backslashreplace') == expected
+ expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff"
+ assert sin.encode("iso-8859-15", "backslashreplace") == expected
def test_badhandler(self):
import codecs
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,3 +1,5 @@
+from rpython.rlib.rutf8 import get_utf8_length
+
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.typedef import (
TypeDef, generic_new_descr, GetSetProperty)
@@ -152,7 +154,7 @@
if self.readnl is None:
w_readnl = space.w_None
else:
- w_readnl = space.str(space.new_from_utf8(self.readnl)) # YYY
+ w_readnl = space.str(space.newutf8(self.readnl, get_utf8_length(self.readnl))) # YYY
return space.newtuple([
w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
])
@@ -215,7 +217,8 @@
if self.writenl:
w_decoded = space.call_method(
w_decoded, "replace",
- space.newtext("\n"), space.new_from_utf8(self.writenl))
+ space.newtext("\n"), space.newutf8(self.writenl,
+ get_utf8_length(self.writenl)))
string = space.utf8_w(w_decoded)
if string:
self.buf.write(string)
@@ -225,7 +228,9 @@
def read_w(self, space, w_size=None):
self._check_closed(space)
size = convert_size(space, w_size)
- return space.new_from_utf8(self.buf.read(size))
+ v = self.buf.read(size)
+ lgt = get_utf8_length(v)
+ return space.newutf8(v, lgt)
def readline_w(self, space, w_limit=None):
self._check_closed(space)
@@ -239,7 +244,8 @@
else:
newline = self.readnl
result = self.buf.readline(newline, limit)
- return space.new_from_utf8(result)
+ resultlen = get_utf8_length(result)
+ return space.newutf8(result, resultlen)
@unwrap_spec(pos=int, mode=int)
@@ -276,7 +282,9 @@
def getvalue_w(self, space):
self._check_closed(space)
- return space.new_from_utf8(self.buf.getvalue())
+ v = self.buf.getvalue()
+ lgt = get_utf8_length(v)
+ return space.newutf8(v, lgt)
def readable_w(self, space):
self._check_closed(space)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -12,7 +12,8 @@
from rpython.rlib.rbigint import rbigint
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos,
- codepoints_in_utf8)
+ codepoints_in_utf8, get_utf8_length,
+ Utf8StringBuilder)
STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -684,13 +685,15 @@
w_bytes = space.call_method(self.w_buffer, "read")
w_decoded = space.call_method(self.w_decoder, "decode", w_bytes,
space.w_True)
check_decoded(space, w_decoded)
- w_result = space.new_from_utf8(self.decoded.get_chars(-1))
+ chars = self.decoded.get_chars(-1)
+ lgt = get_utf8_length(chars)
+ w_result = space.newutf8(chars, lgt)
w_final = space.add(w_result, w_decoded)
self.snapshot = None
return w_final
remaining = size
- builder = StringBuilder(size)
+ builder = Utf8StringBuilder(size)
# Keep reading chunks until we have n characters to return
while remaining > 0:
@@ -700,7 +703,7 @@
builder.append(data)
remaining -= len(data)
- return space.new_from_utf8(builder.build())
+ return space.newutf8(builder.build(), builder.get_length())
def _scan_line_ending(self, limit):
if self.readuniversal:
@@ -725,6 +728,7 @@
limit = convert_size(space, w_limit)
remnant = None
builder = StringBuilder()
+ # XXX maybe use Utf8StringBuilder instead?
while True:
# First, get some data if necessary
has_data = self._ensure_data(space)
@@ -771,7 +775,8 @@
self.decoded.reset()
result = builder.build()
- return space.new_from_utf8(result)
+ lgt = get_utf8_length(result)
+ return space.newutf8(result, lgt)
# _____________________________________________________________
# write methods
@@ -794,8 +799,8 @@
if text.find('\n') >= 0:
haslf = True
if haslf and self.writetranslate and self.writenl:
- w_text = space.call_method(w_text, "replace", space.new_from_utf8('\n'),
- space.new_from_utf8(self.writenl))
+ w_text = space.call_method(w_text, "replace", space.newutf8('\n', 1),
+ space.newutf8(self.writenl, get_utf8_length(self.writenl)))
text = space.utf8_w(w_text)
needflush = False
diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py
--- a/pypy/module/_locale/interp_locale.py
+++ b/pypy/module/_locale/interp_locale.py
@@ -133,10 +133,11 @@
rffi.free_charp(s1_c)
rffi.free_charp(s2_c)
- s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2)
+ s1, l1 = space.utf8_len_w(w_s1)
+ s2, l2 = space.utf8_len_w(w_s2)
- s1_c = rffi.unicode2wcharp(s1)
- s2_c = rffi.unicode2wcharp(s2)
+ s1_c = rffi.utf82wcharp(s1, l1)
+ s2_c = rffi.utf82wcharp(s2, l2)
try:
result = _wcscoll(s1_c, s2_c)
finally:
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -157,7 +157,7 @@
replace, end = errorcb(errors, namecb, reason,
stringdata, start, end)
# 'replace' is RPython unicode here
- lgt, _ = rutf8.check_utf8(replace, True)
+ lgt = rutf8.get_utf8_length(replace)
inbuf = rffi.utf82wcharp(replace, lgt)
try:
r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
@@ -268,7 +268,7 @@
rets, end = errorcb(errors, namecb, reason,
unicodedata, start, end)
codec = pypy_cjk_enc_getcodec(encodebuf)
- lgt, _ = rutf8.get_utf8_length_flag(rets)
+ lgt = rutf8.get_utf8_length(rets)
replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
with rffi.scoped_nonmovingbuffer(replace) as inbuf:
r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -66,7 +66,7 @@
pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
assert 0 <= pos <= len(object)
self.pending = object[pos:]
- lgt = rutf8.get_utf8_length_flag(output)
+ lgt = rutf8.get_utf8_length(output)
return space.newutf8(output, lgt)
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -27,8 +27,8 @@
raise wrap_unicodedecodeerror(space, e, input, self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
- lgt, flag = rutf8.check_utf8(utf8_output, True)
- return space.newtuple([space.newutf8(utf8_output, lgt, flag),
+ lgt = rutf8.get_utf8_length(utf8_output)
+ return space.newtuple([space.newutf8(utf8_output, lgt),
space.newint(len(input))])
@unwrap_spec(errors="text_or_none")
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -14,7 +14,7 @@
codecname, string = argv[1], argv[2]
c = c_codecs.getcodec(codecname)
u = c_codecs.decode(c, string)
- lgt, _ = rutf8.get_utf8_length_flag(u)
+ lgt = rutf8.get_utf8_length(u)
r = c_codecs.encode(c, u, lgt)
print r
return 0
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -3,6 +3,7 @@
from rpython.rlib.objectmodel import specialize, always_inline, r_dict
from rpython.rlib import rfloat, runicode, rutf8
from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rarithmetic import r_uint
from pypy.interpreter.error import oefmt
from pypy.interpreter import unicodehelper
@@ -366,7 +367,7 @@
return # help the annotator to know that we'll never go beyond
# this point
#
- utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True)
+ utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True)
builder.append(utf8_ch)
return i
@@ -400,7 +401,7 @@
break
elif ch == '\\' or ch < '\x20':
self.pos = i-1
- return self.space.unicode_w(self.decode_string_escaped(start))
+ return self.decode_string_escaped(start)
strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i]))
bits |= ord(ch)
length = i - start - 1
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -128,7 +128,7 @@
intval: lltype.Signed
"""
self.error(w_ffitype, w_obj)
-
+
def handle_unichar(self, w_ffitype, w_obj, intval):
"""
intval: lltype.Signed
@@ -174,7 +174,7 @@
def handle_struct_rawffi(self, w_ffitype, w_structinstance):
"""
This method should be killed as soon as we remove support for _rawffi structures
-
+
w_structinstance: W_StructureInstance
"""
self.error(w_ffitype, w_structinstance)
@@ -227,7 +227,7 @@
ucharval = self.get_char(w_ffitype)
return space.newbytes(chr(ucharval))
elif w_ffitype.is_unichar():
- wcharval = self.get_unichar(w_ffitype)
+ wcharval = r_uint(self.get_unichar(w_ffitype))
return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
elif w_ffitype.is_double():
return self._float(w_ffitype)
@@ -349,7 +349,7 @@
def get_struct_rawffi(self, w_ffitype, w_structdescr):
"""
This should be killed as soon as we kill support for _rawffi structures
-
+
Return type: lltype.Unsigned
(the address of the structure)
"""
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -448,7 +448,8 @@
elif c == 'c':
return space.newbytes(func(add_arg, argdesc, ll_type))
elif c == 'u':
- return space.newunicode(func(add_arg, argdesc, ll_type))
+ return space.newutf8(rutf8.unichr_as_utf8(
+ ord(func(add_arg, argdesc, ll_type))), 1)
elif c == 'f' or c == 'd' or c == 'g':
return space.newfloat(float(func(add_arg, argdesc, ll_type)))
else:
@@ -596,10 +597,10 @@
return space.w_None
wcharp_addr = rffi.cast(rffi.CWCHARP, address)
if maxlength == -1:
- s = rffi.wcharp2utf8(wcharp_addr)
+ s, lgt = rffi.wcharp2utf8(wcharp_addr)
else:
- s = rffi.wcharpsize2utf8(wcharp_addr, maxlength)
- return space.newunicode(s)
+ s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength)
+ return space.newutf8(s, lgt)
@unwrap_spec(address=r_uint, maxlength=int)
def charp2rawstring(space, address, maxlength=-1):
@@ -612,8 +613,8 @@
def wcharp2rawunicode(space, address, maxlength=-1):
if maxlength == -1:
return wcharp2unicode(space, address)
- s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength)
- return space.newunicode(s)
+ s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength)
+ return space.newutf8(s, maxlength)
@unwrap_spec(address=r_uint, newcontent='bufferstr')
def rawstring2charp(space, address, newcontent):
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1,7 +1,7 @@
from rpython.rlib import jit, rgc, rutf8
from rpython.rlib.buffer import RawBuffer
from rpython.rlib.objectmodel import keepalive_until_here
-from rpython.rlib.rarithmetic import ovfcheck, widen
+from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint
from rpython.rlib.unroll import unrolling_iterable
from rpython.rtyper.annlowlevel import llstr
from rpython.rtyper.lltypesystem import lltype, rffi
@@ -380,6 +380,7 @@
if len(s) % self.itemsize != 0:
raise oefmt(self.space.w_ValueError,
"string length not a multiple of item size")
+ self.check_valid_unicode(space, s) # empty for non-u arrays
oldlen = self.len
new = len(s) / self.itemsize
if not new:
@@ -451,7 +452,7 @@
"""
if self.typecode == 'u':
buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned())
- return space.newutf8(rffi.wcharpsize2unicode(buf, self.len))
+ return space.newutf8(rffi.wcharpsize2utf8(buf, self.len), self.len)
else:
raise oefmt(space.w_ValueError,
"tounicode() may only be called on type 'u' arrays")
@@ -710,6 +711,9 @@
s = "array('%s', %s)" % (self.typecode, space.text_w(r))
return space.newtext(s)
+ def check_valid_unicode(self, space, s):
+ pass # overwritten by u
+
W_ArrayBase.typedef = TypeDef(
'array.array',
__new__ = interp2app(w_array),
@@ -870,6 +874,18 @@
def get_buffer(self):
return rffi.cast(mytype.arrayptrtype, self._buffer)
+ if mytype.unwrap == 'utf8_len_w':
+ def check_valid_unicode(self, space, s):
+ i = 0
+ while i < len(s):
+ if s[i] != '\x00' or ord(s[i + 1]) > 0x10:
+ v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) +
+ (ord(s[i + 2]) << 8) + ord(s[i + 3]))
+ raise oefmt(space.w_ValueError,
+ "Character U+%s is not in range [U+0000, U+10ffff]",
+ hex(v)[2:])
+ i += 4
+
def item_w(self, w_item):
space = self.space
unwrap = getattr(space, mytype.unwrap)
@@ -1013,7 +1029,7 @@
elif mytype.typecode == 'c':
return space.newbytes(item)
elif mytype.typecode == 'u':
- code = ord(item)
+ code = r_uint(ord(item))
return space.newutf8(rutf8.unichr_as_utf8(code), 1)
assert 0, "unreachable"
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -844,13 +844,7 @@
import sys
if sys.maxunicode == 0xffff:
skip("test for 32-bit unicodes")
- a = self.array('u', b'\xff\xff\xff\xff')
- assert len(a) == 1
- assert repr(a[0]) == "u'\Uffffffff'"
- if sys.maxint == 2147483647:
- assert ord(a[0]) == -1
- else:
- assert ord(a[0]) == 4294967295
+ raises(ValueError, self.array, 'u', b'\xff\xff\xff\xff')
def test_weakref(self):
import weakref
diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py
--- a/pypy/module/cpyext/longobject.py
+++ b/pypy/module/cpyext/longobject.py
@@ -4,6 +4,7 @@
CONST_STRING, ADDR, CANNOT_FAIL)
from pypy.objspace.std.longobject import W_LongObject
from pypy.interpreter.error import OperationError
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask
from rpython.rlib.rbigint import rbigint
@@ -191,7 +192,7 @@
string, length gives the number of characters, and base is the radix
for the conversion. The radix must be in the range [2, 36]; if it is
out of range, ValueError will be raised."""
- w_value = space.newunicode(rffi.wcharpsize2unicode(u, length))
+ w_value = space.newutf8(wcharpsize2utf8(space, u, length), length)
w_base = space.newint(rffi.cast(lltype.Signed, base))
return space.call_function(space.w_long, w_value, w_base)
diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py
--- a/pypy/module/cpyext/object.py
+++ b/pypy/module/cpyext/object.py
@@ -246,7 +246,7 @@
the Python expression unicode(o). Called by the unicode() built-in
function."""
if w_obj is None:
- return space.newunicode(u"<NULL>")
+ return space.newutf8("<NULL>", 6)
return space.call_function(space.w_unicode, w_obj)
@cpython_api([PyObject, PyObject], rffi.INT_real, error=-1)
@@ -302,7 +302,7 @@
if opid == Py_EQ:
return 1
if opid == Py_NE:
- return 0
+ return 0
w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int)
return int(space.is_true(w_res))
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,5 +1,11 @@
+from rpython.rtyper.lltypesystem import rffi, lltype
+from rpython.rlib import rstring, runicode
+from rpython.tool.sourcetools import func_renamer
+
from pypy.interpreter.error import OperationError, oefmt
-from rpython.rtyper.lltypesystem import rffi, lltype
+from pypy.interpreter.unicodehelper import (
+ wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
+ unicode_encode_decimal)
from pypy.module.unicodedata import unicodedb
from pypy.module.cpyext.api import (
CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -13,8 +19,6 @@
from pypy.module.sys.interp_encoding import setdefaultencoding
from pypy.module._codecs.interp_codecs import CodecState
from pypy.objspace.std import unicodeobject
-from rpython.rlib import rstring, runicode
-from rpython.tool.sourcetools import func_renamer
import sys
## See comment in bytesobject.py.
@@ -61,10 +65,10 @@
def unicode_attach(space, py_obj, w_obj, w_userdata=None):
"Fills a newly allocated PyUnicodeObject with a unicode string"
py_unicode = rffi.cast(PyUnicodeObject, py_obj)
- s = space.unicode_w(w_obj)
- py_unicode.c_length = len(s)
+ s, length = space.utf8_len_w(w_obj)
+ py_unicode.c_length = length
py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
- py_unicode.c_hash = space.hash_w(space.newunicode(s))
+ py_unicode.c_hash = space.hash_w(space.newutf8(s, length))
py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
def unicode_realize(space, py_obj):
@@ -73,11 +77,12 @@
be modified after this call.
"""
py_uni = rffi.cast(PyUnicodeObject, py_obj)
- s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length)
+ length = py_uni.c_length
+ s = wcharpsize2utf8(space, py_uni.c_str, length)
w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type))
w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type)
- w_obj.__init__(s)
- py_uni.c_hash = space.hash_w(space.newunicode(s))
+ w_obj.__init__(s, length)
+ py_uni.c_hash = space.hash_w(space.newutf8(s, length))
track_reference(space, py_obj, w_obj)
return w_obj
@@ -214,8 +219,8 @@
if not ref_unicode.c_str:
# Copy unicode buffer
w_unicode = from_ref(space, rffi.cast(PyObject, ref))
- u = space.unicode_w(w_unicode)
- ref_unicode.c_str = rffi.unicode2wcharp(u)
+ u, length = space.utf8_len_w(w_unicode)
+ ref_unicode.c_str = rffi.utf82wcharp(u, length)
return ref_unicode.c_str
@cpython_api([PyObject], rffi.CWCHARP)
@@ -335,8 +340,8 @@
Therefore, modification of the resulting Unicode object is only allowed when u
is NULL."""
if wchar_p:
- s = rffi.wcharpsize2unicode(wchar_p, length)
- return make_ref(space, space.newunicode(s))
+ s = wcharpsize2utf8(space, wchar_p, length)
+ return make_ref(space, space.newutf8(s, length))
else:
return rffi.cast(PyObject, new_empty_unicode(space, length))
@@ -506,7 +511,8 @@
"""Encode the Py_UNICODE buffer of the given size and return a
Python string object. Return NULL if an exception was raised
by the codec."""
- w_u = space.newunicode(rffi.wcharpsize2unicode(s, size))
+ u = wcharpsize2utf8(space, s, size)
+ w_u = space.newutf8(u, size)
if errors:
w_errors = space.newtext(rffi.charp2str(errors))
else:
@@ -564,15 +570,11 @@
else:
errors = None
- result, length, byteorder = runicode.str_decode_utf_16_helper(
- string, size, errors,
- True, # final ? false for multiple passes?
- None, # errorhandler
- byteorder)
+ result, _, length, byteorder = str_decode_utf_16_helper(
+ string, errors, final=True, errorhandler=None, byteorder=byteorder)
if pbyteorder is not None:
pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
- return space.newunicode(result)
+ return space.newutf8(result, length)
@cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject)
def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder):
@@ -620,15 +622,11 @@
else:
errors = None
- result, length, byteorder = runicode.str_decode_utf_32_helper(
- string, size, errors,
- True, # final ? false for multiple passes?
- None, # errorhandler
- byteorder)
+ result, _, length, byteorder = str_decode_utf_32_helper(
+ string, errors, final=True, errorhandler=None, byteorder=byteorder)
if pbyteorder is not None:
pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
- return space.newunicode(result)
+ return space.newutf8(result, length)
@cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING],
rffi.INT_real, error=-1)
@@ -646,14 +644,13 @@
Returns 0 on success, -1 on failure.
"""
- u = rffi.wcharpsize2unicode(s, length)
+ u = rffi.wcharpsize2utf8(s, length)
if llerrors:
errors = rffi.charp2str(llerrors)
else:
errors = None
state = space.fromcache(CodecState)
- result = runicode.unicode_encode_decimal(u, length, errors,
- state.encode_error_handler)
+ result = unicode_encode_decimal(u, errors, state.encode_error_handler)
i = len(result)
output[i] = '\0'
i -= 1
@@ -706,12 +703,17 @@
"""Return 1 if substr matches str[start:end] at the given tail end
(direction == -1 means to do a prefix match, direction == 1 a
suffix match), 0 otherwise. Return -1 if an error occurred."""
- str = space.unicode_w(w_str)
- substr = space.unicode_w(w_substr)
+ space.utf8_w(w_str) # type check
+ space.utf8_w(w_substr)
+ w_start = space.newint(start)
+ w_end = space.newint(end)
if rffi.cast(lltype.Signed, direction) <= 0:
- return rstring.startswith(str, substr, start, end)
+ w_result = space.call_method(
+ w_str, "startswith", w_substr, w_start, w_end)
else:
- return rstring.endswith(str, substr, start, end)
+ w_result = space.call_method(
+ w_str, "endswith", w_substr, w_start, w_end)
+ return space.int_w(w_result)
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t,
error=-1)
def PyUnicode_Count(space, w_str, w_substr, start, end):
diff --git a/pypy/module/pyexpat/interp_pyexpat.py
b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -483,7 +483,7 @@
except rutf8.CheckError:
from pypy.interpreter import unicodehelper
# get the correct error msg
- unicodehelper.str_decode_utf8(s, len(s), 'string', True,
+ unicodehelper.str_decode_utf8(s, 'string', True,
unicodehelper.decode_error_handler(space))
assert False, "always raises"
else:
@@ -587,21 +587,22 @@
def UnknownEncodingHandler(self, space, name, info):
# Yes, supports only 8bit encodings
- translationmap = space.unicode_w(
+ translationmap, lgt = space.utf8_len_w(
space.call_method(
space.newbytes(self.all_chars), "decode",
space.newtext(name), space.newtext("replace")))
- if len(translationmap) != 256:
+ if lgt != 256:
raise oefmt(space.w_ValueError,
"multi-byte encodings are not supported")
- for i in range(256):
- c = translationmap[i]
- if c == u'\ufffd':
+ i = 0
+ for c in rutf8.Utf8StringIterator(translationmap):
+ if c == 0xfffd:
info.c_map[i] = rffi.cast(rffi.INT, -1)
else:
info.c_map[i] = rffi.cast(rffi.INT, c)
+ i += 1
info.c_data = lltype.nullptr(rffi.VOIDP.TO)
info.c_convert = lltype.nullptr(rffi.VOIDP.TO)
info.c_release = lltype.nullptr(rffi.VOIDP.TO)
diff --git a/pypy/module/struct/formatiterator.py
b/pypy/module/struct/formatiterator.py
--- a/pypy/module/struct/formatiterator.py
+++ b/pypy/module/struct/formatiterator.py
@@ -1,6 +1,6 @@
from rpython.rlib.rarithmetic import (r_uint, r_ulonglong, r_longlong,
maxint, intmask)
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rstruct.error import StructError
from rpython.rlib.rstruct.formatiterator import FormatIterator
@@ -107,7 +107,7 @@
def accept_unicode_arg(self):
w_obj = self.accept_obj_arg()
- return self.space.unicode_w(w_obj)
+ return self.space.utf8_len_w(w_obj)
def accept_float_arg(self):
w_obj = self.accept_obj_arg()
@@ -191,6 +191,10 @@
assert 0, "unreachable"
self.result_w.append(w_value)
+ def append_utf8(self, value):
+ w_ch = self.space.newutf8(rutf8.unichr_as_utf8(r_uint(value)), 1)
+ self.result_w.append(w_ch)
+
def get_pos(self):
return self.pos
diff --git a/pypy/module/unicodedata/interp_ucd.py
b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -7,11 +7,8 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.typedef import TypeDef, interp_attrproperty
from rpython.rlib.rarithmetic import r_longlong
-from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.runicode import MAXUNICODE
from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
-from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate
-import sys
+from rpython.rlib.rutf8 import Utf8StringBuilder, unichr_as_utf8
# Contants for Hangul characters
@@ -30,49 +27,17 @@
# unicode code point.
-if MAXUNICODE > 0xFFFF:
- # Target is wide build
- def unichr_to_code_w(space, w_unichr):
- if not space.isinstance_w(w_unichr, space.w_unicode):
- raise oefmt(
- space.w_TypeError, 'argument 1 must be unicode, not %T',
- w_unichr)
+# Target is wide build
+def unichr_to_code_w(space, w_unichr):
+ if not space.isinstance_w(w_unichr, space.w_unicode):
+ raise oefmt(
+ space.w_TypeError, 'argument 1 must be unicode, not %T',
+ w_unichr)
- if not we_are_translated() and sys.maxunicode == 0xFFFF:
- # Host CPython is narrow build, accept surrogates
- try:
- return ord_accepts_surrogate(space.unicode_w(w_unichr))
- except TypeError:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
- else:
- if not space.len_w(w_unichr) == 1:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
- return space.int_w(space.ord(w_unichr))
-
-else:
- # Target is narrow build
- def unichr_to_code_w(space, w_unichr):
- if not space.isinstance_w(w_unichr, space.w_unicode):
- raise oefmt(
- space.w_TypeError, 'argument 1 must be unicode, not %T',
- w_unichr)
-
- if not we_are_translated() and sys.maxunicode > 0xFFFF:
- # Host CPython is wide build, forbid surrogates
- if not space.len_w(w_unichr) == 1:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
- return space.int_w(space.ord(w_unichr))
-
- else:
- # Accept surrogates
- try:
- return ord_accepts_surrogate(space.unicode_w(w_unichr))
- except TypeError:
- raise oefmt(space.w_TypeError,
- "need a single Unicode character as parameter")
+ if not space.len_w(w_unichr) == 1:
+ raise oefmt(space.w_TypeError,
+ "need a single Unicode character as parameter")
+ return space.int_w(space.ord(w_unichr))
class UCD(W_Root):
@@ -110,7 +75,8 @@
except KeyError:
msg = space.mod(space.newtext("undefined character name '%s'"),
space.newtext(name))
raise OperationError(space.w_KeyError, msg)
- return space.newunicode(code_to_unichr(code))
+ assert code >= 0
+ return space.newutf8(unichr_as_utf8(code), 1)
def name(self, space, w_unichr, w_default=None):
code = unichr_to_code_w(space, w_unichr)
@@ -259,10 +225,10 @@
result[0] = ch
if not composed: # If decomposed normalization we are done
- return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+ return self.build(space, result, stop=j)
if j <= 1:
- return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+ return self.build(space, result, stop=j)
current = result[0]
starter_pos = 0
@@ -310,7 +276,13 @@
result[starter_pos] = current
- return space.newunicode(u''.join([unichr(i) for i in
result[:next_insert]]))
+ return self.build(space, result, stop=next_insert)
+
+ def build(self, space, r, stop):
+ builder = Utf8StringBuilder(stop * 3)
+ for i in range(stop):
+ builder.append_code(r[i])
+ return space.newutf8(builder.build(), stop)
methods = {}
diff --git a/pypy/module/unicodedata/test/test_hyp.py
b/pypy/module/unicodedata/test/test_hyp.py
--- a/pypy/module/unicodedata/test/test_hyp.py
+++ b/pypy/module/unicodedata/test/test_hyp.py
@@ -1,3 +1,4 @@
+
import pytest
try:
from hypothesis import given, strategies as st, example, settings
@@ -5,12 +6,14 @@
pytest.skip("hypothesis required")
from pypy.module.unicodedata.interp_ucd import ucd
+from rpython.rlib.rutf8 import get_utf8_length
def make_normalization(space, NF_code):
def normalize(s):
- w_s = space.newunicode(s)
+ u = s.encode('utf8')
+ w_s = space.newutf8(u, get_utf8_length(u))
w_res = ucd.normalize(space, NF_code, w_s)
- return space.unicode_w(w_res)
+ return space.utf8_w(w_res).decode('utf8')
return normalize
all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD']
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -212,9 +212,6 @@
def newutf8(self, x, l):
return w_some_obj()
- def new_from_utf8(self, a):
- return w_some_obj()
-
def newunicode(self, a):
return w_some_obj()
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -3,7 +3,7 @@
from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import specialize
-from rpython.rlib.rarithmetic import INT_MAX
+from rpython.rlib.rarithmetic import INT_MAX, r_uint
from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.unroll import unrolling_iterable
@@ -330,7 +330,7 @@
space = self.space
if do_unicode:
cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
- w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1)
+ w_s = space.newutf8(rutf8.unichr_as_utf8(r_uint(cp)), 1)
else:
cp = ord(self.fmt[self.fmtpos - 1])
w_s = space.newbytes(chr(cp))
@@ -466,7 +466,7 @@
n = space.int_w(w_value)
if do_unicode:
try:
- c = rutf8.unichr_as_utf8(n)
+ c = rutf8.unichr_as_utf8(r_uint(n))
except ValueError:
raise oefmt(space.w_OverflowError,
"unicode character code out of range")
diff --git a/pypy/objspace/std/test/test_unicodeobject.py
b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -741,6 +741,8 @@
assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96'
+ assert u'\ud800\udc02'.encode('uTf-8') == '\xf0\x90\x80\x82'
+ assert u'\ud84d\udc56'.encode('Utf8') == '\xf0\xa3\x91\x96'
assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80'
assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80'
assert (u'\ud800\udc02'*1000).encode('utf-8') ==
'\xf0\x90\x80\x82'*1000
diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py
--- a/rpython/annotator/unaryop.py
+++ b/rpython/annotator/unaryop.py
@@ -792,7 +792,7 @@
def ord(self):
# warning, on 32-bit with 32-bit unichars, this might return
# negative numbers
- return SomeInteger()
+ return SomeInteger(nonneg=True)
class __extend__(SomeIterator):
diff --git a/rpython/rlib/rstruct/nativefmttable.py
b/rpython/rlib/rstruct/nativefmttable.py
--- a/rpython/rlib/rstruct/nativefmttable.py
+++ b/rpython/rlib/rstruct/nativefmttable.py
@@ -4,7 +4,7 @@
"""
import struct
-from rpython.rlib import jit, longlong2float
+from rpython.rlib import rutf8, longlong2float
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rarithmetic import r_singlefloat, widen, intmask
from rpython.rlib.rstruct import standardfmttable as std
@@ -139,17 +139,17 @@
from rpython.rlib.rstruct import unichar
def pack_unichar(fmtiter):
- unistr = fmtiter.accept_unicode_arg()
- if len(unistr) != 1:
+ utf8, lgt = fmtiter.accept_unicode_arg()
+ if lgt != 1:
raise StructError("expected a unicode string of length 1")
- c = unistr[0] # string->char conversion for the annotator
- unichar.pack_unichar(c, fmtiter.wbuf, fmtiter.pos)
+ uchr = rutf8.codepoint_at_pos(utf8, 0)
+ unichar.pack_codepoint(uchr, fmtiter.wbuf, fmtiter.pos)
fmtiter.advance(unichar.UNICODE_SIZE)
@specialize.argtype(0)
def unpack_unichar(fmtiter):
data = fmtiter.read(unichar.UNICODE_SIZE)
- fmtiter.appendobj(unichar.unpack_unichar(data))
+ fmtiter.append_utf8(unichar.unpack_codepoint(data))
native_fmttable['u'] = {'size': unichar.UNICODE_SIZE,
'alignment': unichar.UNICODE_SIZE,
diff --git a/rpython/rlib/rstruct/unichar.py b/rpython/rlib/rstruct/unichar.py
--- a/rpython/rlib/rstruct/unichar.py
+++ b/rpython/rlib/rstruct/unichar.py
@@ -3,12 +3,8 @@
"""
import sys
-from rpython.rlib.runicode import MAXUNICODE
-if MAXUNICODE <= 65535:
- UNICODE_SIZE = 2
-else:
- UNICODE_SIZE = 4
+UNICODE_SIZE = 4
BIGENDIAN = sys.byteorder == "big"
def pack_unichar(unich, buf, pos):
@@ -34,7 +30,7 @@
buf.setitem(pos+2, chr((unich >> 16) & 0xFF))
buf.setitem(pos+3, chr(unich >> 24))
-def unpack_unichar(rawstring):
+def unpack_codepoint(rawstring):
assert len(rawstring) == UNICODE_SIZE
if UNICODE_SIZE == 2:
if BIGENDIAN:
@@ -54,4 +50,7 @@
ord(rawstring[1]) << 8 |
ord(rawstring[2]) << 16 |
ord(rawstring[3]) << 24)
- return unichr(n)
+ return n
+
+def unpack_unichar(rawstring):
+ return unichr(unpack_codepoint(rawstring))
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -19,7 +19,7 @@
from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib import jit
+from rpython.rlib import jit, types
from rpython.rlib.signature import signature
from rpython.rlib.types import char, none
from rpython.rlib.rarithmetic import r_uint
@@ -27,6 +27,8 @@
from rpython.rtyper.lltypesystem import lltype, rffi
+# we need a way to accept both r_uint and int(nonneg=True)
+#@signature(types.int_nonneg(), types.bool(), returns=types.str())
def unichr_as_utf8(code, allow_surrogates=False):
"""Encode code (numeric value) as utf8 encoded string
"""
@@ -437,7 +439,7 @@
low = codepoint_at_pos(utf8, i)
if 0xDC00 <= low <= 0xDFFF:
uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
- i = next_codepoint_pos(utf8, i)
+ i = next_codepoint_pos(utf8, i)
# else not really a surrogate pair, just append high
else:
i = next_codepoint_pos(utf8, i)
@@ -535,6 +537,13 @@
else:
return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
+def _pos_at_index(utf8, index):
+ # Slow!
+ pos = 0
+ for _ in range(index):
+ pos = next_codepoint_pos(utf8, pos)
+ return pos
+
@jit.dont_look_inside
def codepoint_at_index(utf8, storage, index):
""" Return codepoint of a character inside utf8 encoded string, given
diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py
--- a/rpython/rlib/types.py
+++ b/rpython/rlib/types.py
@@ -26,6 +26,8 @@
def int():
return model.SomeInteger()
+def int_nonneg():
+ return model.SomeInteger(nonneg=True)
def bool():
return model.SomeBool()
diff --git a/rpython/rtyper/lltypesystem/rffi.py
b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1019,7 +1019,27 @@
s = StringBuilder(size)
for i in range(size):
rutf8.unichr_as_utf8_append(s, ord(w[i]))
- return s.build()
+ return s.build()
+
+def wcharp2utf8(w):
+ from rpython.rlib import rutf8
+
+ s = rutf8.Utf8StringBuilder()
+ i = 0
+ while ord(w[i]):
+ s.append_code(ord(w[i]))
+ i += 1
+ return s.build(), i
+
+def wcharp2utf8n(w, maxlen):
+ from rpython.rlib import rutf8
+
+ s = rutf8.Utf8StringBuilder(maxlen)
+ i = 0
+ while i < maxlen and w[i]:
+ s.append_code(ord(w[i]))
+ i += 1
+ return s.build(), i
def utf82wcharp(utf8, utf8len):
from rpython.rlib import rutf8
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit