Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8-test
Changeset: r93322:33d09fc56c08
Date: 2017-12-08 13:28 +0000
http://bitbucket.org/pypy/pypy/changeset/33d09fc56c08/
Log: hg merge unicode-utf8
diff too long, truncating to 2000 out of 3186 lines
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -9,5 +9,6 @@
* remove assertions from W_UnicodeObject.__init__ if all the builders pass
* what to do with error handlers that go backwards. There were tests
in test_codecs that would check for that
+* improve performance of splitlines
* fix _pypyjson to not use a wrapped dict when decoding an object
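The bulk of the merge below is a mechanical change of calling convention:
rutf8.check_utf8() and the str_decode_*() helpers no longer return a FLAG_*
value, and space.newutf8() loses its third argument. A minimal sketch of the
two conventions, written with the same names the hunks below use (a "space"
object and a utf8-encoded str are assumed to be available):

    # old convention (pre-merge): codepoint length and flag travel together
    lgt, flag = rutf8.check_utf8(utf8_string, True)
    w_obj = space.newutf8(utf8_string, lgt, flag)

    # new convention (post-merge): only the codepoint length is kept
    lgt = rutf8.check_utf8(utf8_string, True)
    w_obj = space.newutf8(utf8_string, lgt)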
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1087,8 +1087,11 @@
def newlist_utf8(self, list_u, is_ascii):
l_w = [None] * len(list_u)
for i, item in enumerate(list_u):
- length, flag = rutf8.check_utf8(item, True)
- l_w[i] = self.newutf8(item, length, flag)
+ if not is_ascii:
+ length = rutf8.check_utf8(item, True)
+ else:
+ length = len(item)
+ l_w[i] = self.newutf8(item, length)
return self.newlist(l_w)
def newlist_int(self, list_i):
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -64,8 +64,8 @@
r = unicodehelper.decode_raw_unicode_escape(space, substr)
else:
r = unicodehelper.decode_unicode_escape(space, substr)
- v, length, flag = r
- return space.newutf8(v, length, flag)
+ v, length = r
+ return space.newutf8(v, length)
need_encoding = (encoding is not None and
encoding != "utf-8" and encoding != "utf8" and
@@ -74,8 +74,8 @@
substr = s[ps : q]
if rawmode or '\\' not in s[ps:]:
if need_encoding:
- lgt, flag = unicodehelper.check_utf8_or_raise(space, substr)
- w_u = space.newutf8(substr, lgt, flag)
+ lgt = unicodehelper.check_utf8_or_raise(space, substr)
+ w_u = space.newutf8(substr, lgt)
w_v = unicodehelper.encode(space, w_u, encoding)
return w_v
else:
@@ -234,8 +234,8 @@
p = ps
while p < end and ord(s[p]) & 0x80:
p += 1
- lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p)
- w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag),
+ lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+ w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt),
recode_encoding)
v = space.bytes_w(w_v)
return v, p
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -10,13 +10,13 @@
return str_decode_utf8(u, True, "strict", None)
def test_decode_utf8():
- assert decode_utf8("abc") == ("abc", 3, 3, rutf8.FLAG_ASCII)
- assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1,
rutf8.FLAG_REGULAR)
- assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1,
rutf8.FLAG_HAS_SURROGATES)
- assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1,
rutf8.FLAG_HAS_SURROGATES)
+ assert decode_utf8("abc") == ("abc", 3, 3)
+ assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1)
+ assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1)
+ assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1)
assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == (
- "\xed\xa0\x80\xed\xb0\x80", 6, 2, rutf8.FLAG_HAS_SURROGATES)
- assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1,
rutf8.FLAG_REGULAR)
+ "\xed\xa0\x80\xed\xb0\x80", 6, 2)
+ assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1)
def test_utf8_encode_ascii():
assert utf8_encode_ascii("abc", "??", "??") == "abc"
@@ -41,19 +41,19 @@
assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
def test_str_decode_ascii():
- assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3,
rutf8.FLAG_ASCII)
+ assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3)
def eh(errors, encoding, reason, p, start, end):
lst.append((errors, encoding, p, start, end))
return u"\u1234\u5678".encode("utf8"), end
lst = []
input = "\xe8"
exp = u"\u1234\u5678".encode("utf8")
- assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, rutf8.FLAG_REGULAR)
+ assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2)
assert lst == [("??", "ascii", input, 0, 1)]
lst = []
input = "\xe8\xe9abc\xea\xeb"
assert str_decode_ascii(input, "??", True, eh) == (
- exp + exp + "abc" + exp + exp, 7, 11, rutf8.FLAG_REGULAR)
+ exp + exp + "abc" + exp + exp, 7, 11)
assert lst == [("??", "ascii", input, 0, 1),
("??", "ascii", input, 1, 2),
("??", "ascii", input, 5, 6),
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,11 +1,11 @@
import sys
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.objectmodel import specialize
from rpython.rlib import rutf8
-from rpython.rlib.rutf8 import combine_flags
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rlib.rstring import StringBuilder
+from rpython.rtyper.lltypesystem import rffi
from pypy.module._codecs import interp_codecs
@specialize.memo()
@@ -26,10 +26,10 @@
# Fast version of the "strict" errors handler.
def raise_unicode_exception_encode(errors, encoding, msg, utf8,
startingpos, endingpos):
- u_len, flag = rutf8.check_utf8(utf8, True)
+ u_len = rutf8.check_utf8(utf8, True)
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([space.newtext(encoding),
- space.newutf8(utf8, u_len, flag),
+ space.newutf8(utf8, u_len),
space.newint(startingpos),
space.newint(endingpos),
space.newtext(msg)]))
@@ -55,18 +55,18 @@
def decode_unicode_escape(space, string):
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
- result_utf8, consumed, length, flag = str_decode_unicode_escape(
+ result_utf8, consumed, length = str_decode_unicode_escape(
string, "strict",
final=True,
errorhandler=decode_error_handler(space),
ud_handler=unicodedata_handler)
- return result_utf8, length, flag
+ return result_utf8, length
def decode_raw_unicode_escape(space, string):
- result_utf8, consumed, lgt, flag = str_decode_raw_unicode_escape(
+ result_utf8, consumed, lgt = str_decode_raw_unicode_escape(
string, "strict",
final=True, errorhandler=decode_error_handler(space))
- return result_utf8, lgt, flag
+ return result_utf8, lgt
def check_ascii_or_raise(space, string):
try:
@@ -83,19 +83,19 @@
# you still get two surrogate unicode characters in the result.
# These are the Python2 rules; Python3 differs.
try:
- length, flag = rutf8.check_utf8(string, True, start, end)
+ length = rutf8.check_utf8(string, True, start, end)
except rutf8.CheckError as e:
# convert position into unicode position
- lgt, flags = rutf8.check_utf8(string, True, start, stop=e.pos)
+ lgt = rutf8.check_utf8(string, True, start, stop=e.pos)
decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
start + lgt, start + lgt + 1)
assert False, "unreachable"
- return length, flag
+ return length
def str_decode_ascii(s, errors, final, errorhandler):
try:
rutf8.check_ascii(s)
- return s, len(s), len(s), rutf8.FLAG_ASCII
+ return s, len(s), len(s)
except rutf8.CheckError:
return _str_decode_ascii_slowpath(s, errors, final, errorhandler)
@@ -112,13 +112,13 @@
res.append(ch)
i += 1
ress = res.build()
- lgt, flag = rutf8.check_utf8(ress, True)
- return ress, len(s), lgt, flag
+ lgt = rutf8.check_utf8(ress, True)
+ return ress, len(s), lgt
def str_decode_latin_1(s, errors, final, errorhandler):
try:
rutf8.check_ascii(s)
- return s, len(s), len(s), rutf8.FLAG_ASCII
+ return s, len(s), len(s)
except rutf8.CheckError:
return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
@@ -138,7 +138,7 @@
res.append_slice(s, start, end)
i = end
# cannot be ASCII, cannot have surrogates, I believe
- return res.build(), len(s), len(s), rutf8.FLAG_REGULAR
+ return res.build(), len(s), len(s)
def utf8_encode_latin_1(s, errors, errorhandler):
try:
@@ -149,37 +149,32 @@
def _utf8_encode_latin_1_slowpath(s, errors, errorhandler):
res = StringBuilder(len(s))
- size = len(s)
cur = 0
- i = 0
- while i < size:
- if ord(s[i]) <= 0x7F:
- res.append(s[i])
- i += 1
+ iter = rutf8.Utf8StringIterator(s)
+ while True:
+ try:
+ ch = iter.next()
+ except StopIteration:
+ break
+ if ch <= 0xFF:
+ res.append(chr(ch))
cur += 1
else:
- oc = rutf8.codepoint_at_pos(s, i)
- if oc <= 0xFF:
- res.append(chr(oc))
- cur += 1
- i = rutf8.next_codepoint_pos(s, i)
- else:
- r, pos = errorhandler(errors, 'latin1',
- 'ordinal not in range(256)', s, cur,
- cur + 1)
- for j in range(pos - cur):
- i = rutf8.next_codepoint_pos(s, i)
+ r, pos = errorhandler(errors, 'latin1',
+ 'ordinal not in range(256)', s, cur,
+ cur + 1)
- j = 0
- while j < len(r):
- c = rutf8.codepoint_at_pos(r, j)
- if c > 0xFF:
- errorhandler("strict", 'latin1',
- 'ordinal not in range(256)', s,
- cur, cur + 1)
- j = rutf8.next_codepoint_pos(r, j)
- res.append(chr(c))
- cur = pos
+ for c in rutf8.Utf8StringIterator(r):
+ if c > 0xFF:
+ errorhandler("strict", 'latin1',
+ 'ordinal not in range(256)', s,
+ cur, cur + 1)
+ res.append(chr(c))
+
+ for j in range(pos - cur - 1):
+ iter.next()
+
+ cur = pos
r = res.build()
return r
@@ -210,7 +205,7 @@
if c > 0x7F:
errorhandler("strict", 'ascii',
'ordinal not in range(128)', utf8,
- pos, pos + 1)
+ pos, pos + 1)
j = rutf8.next_codepoint_pos(r, j)
pos = newpos
res.append(r)
@@ -341,8 +336,7 @@
res.append(r)
r = res.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return r, pos, lgt, flag
+ return r, pos, rutf8.check_utf8(r, True)
hexdigits = "0123456789ABCDEFabcdef"
@@ -355,7 +349,7 @@
endinpos += 1
res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos)
- size, flag = rutf8.check_utf8(res, True)
+ size = rutf8.check_utf8(res, True)
builder.append(res)
else:
try:
@@ -366,7 +360,7 @@
endinpos += 1
res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos)
- size, flag = rutf8.check_utf8(res, True)
+ size = rutf8.check_utf8(res, True)
builder.append(res)
else:
# when we get here, chr is a 32-bit unicode character
@@ -376,21 +370,19 @@
message = "illegal Unicode character"
res, pos = errorhandler(errors, encoding,
message, s, pos-2, pos+digits)
- size, flag = rutf8.check_utf8(res, True)
+ size = rutf8.check_utf8(res, True)
builder.append(res)
else:
- flag = rutf8.get_flag_from_code(intmask(chr))
pos += digits
size = 1
- return pos, size, flag
+ return pos, size
def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
size = len(s)
if size == 0:
- return '', 0, 0, rutf8.FLAG_ASCII
+ return '', 0, 0
- flag = rutf8.FLAG_ASCII
builder = StringBuilder(size)
pos = 0
outsize = 0
@@ -401,7 +393,6 @@
if ch != '\\':
if ord(ch) > 0x7F:
rutf8.unichr_as_utf8_append(builder, ord(ch))
- flag = combine_flags(rutf8.FLAG_REGULAR, flag)
else:
builder.append(ch)
pos += 1
@@ -414,9 +405,8 @@
message = "\\ at end of string"
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, size)
- newsize, newflag = rutf8.check_utf8(res, True)
+ newsize = rutf8.check_utf8(res, True)
outsize + newsize
- flag = combine_flags(flag, newflag)
builder.append(res)
continue
@@ -469,7 +459,6 @@
outsize += 1
if x > 0x7F:
rutf8.unichr_as_utf8_append(builder, x)
- flag = combine_flags(rutf8.FLAG_REGULAR, flag)
else:
builder.append(chr(x))
# hex escapes
@@ -477,27 +466,24 @@
elif ch == 'x':
digits = 2
message = "truncated \\xXX escape"
- pos, newsize, newflag = hexescape(builder, s, pos, digits,
+ pos, newsize = hexescape(builder, s, pos, digits,
"unicodeescape", errorhandler, message, errors)
- flag = combine_flags(flag, newflag)
outsize += newsize
# \uXXXX
elif ch == 'u':
digits = 4
message = "truncated \\uXXXX escape"
- pos, newsize, newflag = hexescape(builder, s, pos, digits,
+ pos, newsize = hexescape(builder, s, pos, digits,
"unicodeescape", errorhandler, message, errors)
- flag = combine_flags(flag, newflag)
outsize += newsize
# \UXXXXXXXX
elif ch == 'U':
digits = 8
message = "truncated \\UXXXXXXXX escape"
- pos, newsize, newflag = hexescape(builder, s, pos, digits,
+ pos, newsize = hexescape(builder, s, pos, digits,
"unicodeescape", errorhandler, message, errors)
- flag = combine_flags(flag, newflag)
outsize += newsize
# \N{name}
@@ -517,29 +503,25 @@
if code < 0:
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
- newsize, newflag = rutf8.check_utf8(res, True)
- flag = combine_flags(flag, newflag)
+ newsize = rutf8.check_utf8(res, True)
outsize += newsize
builder.append(res)
continue
pos = look + 1
outsize += 1
- flag = combine_flags(flag, rutf8.get_flag_from_code(code))
rutf8.unichr_as_utf8_append(builder, code,
allow_surrogates=True)
# xxx 'code' is probably always within range here...
else:
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
- newsize, newflag = rutf8.check_utf8(res, True)
- flag = combine_flags(flag, newflag)
+ newsize = rutf8.check_utf8(res, True)
outsize += newsize
builder.append(res)
else:
res, pos = errorhandler(errors, "unicodeescape",
message, s, pos-1, look+1)
- newsize, newflag = rutf8.check_utf8(res, True)
- flag = combine_flags(flag, newflag)
+ newsize = rutf8.check_utf8(res, True)
outsize += newsize
builder.append(res)
else:
@@ -547,7 +529,20 @@
builder.append(ch)
outsize += 2
- return builder.build(), pos, outsize, flag
+ return builder.build(), pos, outsize
+
+def wcharpsize2utf8(space, wcharp, size):
+ """Safe version of rffi.wcharpsize2utf8.
+
+ Raises app-level ValueError if any wchar value is outside the valid
+ codepoint range.
+ """
+ try:
+ return rffi.wcharpsize2utf8(wcharp, size)
+ except ValueError:
+ raise oefmt(space.w_ValueError,
+ "character is not in range [U+0000; U+10ffff]")
+
# ____________________________________________________________
# Raw unicode escape
@@ -556,7 +551,7 @@
errorhandler=None):
size = len(s)
if size == 0:
- return '', 0, 0, rutf8.FLAG_ASCII
+ return '', 0, 0
result = StringBuilder(size)
pos = 0
@@ -594,12 +589,12 @@
digits = 4 if s[pos] == 'u' else 8
message = "truncated \\uXXXX"
pos += 1
- pos, _, _ = hexescape(result, s, pos, digits,
- "rawunicodeescape", errorhandler, message, errors)
+ pos, _ = hexescape(result, s, pos, digits,
+ "rawunicodeescape", errorhandler, message, errors)
r = result.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return r, pos, lgt, flag
+ lgt = rutf8.check_utf8(r, True)
+ return r, pos, lgt
_utf8_encode_unicode_escape = rutf8.make_utf8_escape_function()
@@ -734,7 +729,7 @@
errorhandler=None):
size = len(s)
if size == 0:
- return '', 0, 0, rutf8.FLAG_ASCII
+ return '', 0, 0
inShift = False
base64bits = 0
@@ -745,7 +740,6 @@
result = StringBuilder(size)
pos = 0
shiftOutStartPos = 0
- flag = rutf8.FLAG_ASCII
startinpos = 0
while pos < size:
ch = s[pos]
@@ -771,13 +765,11 @@
(outCh & 0x3FF)) + 0x10000
rutf8.unichr_as_utf8_append(result, code)
outsize += 1
- flag = combine_flags(flag, rutf8.FLAG_REGULAR)
surrogate = 0
continue
else:
rutf8.unichr_as_utf8_append(result, surrogate,
allow_surrogates=True)
- flag = rutf8.FLAG_HAS_SURROGATES
outsize += 1
surrogate = 0
# Not done with outCh: falls back to next line
@@ -785,8 +777,6 @@
# first surrogate
surrogate = outCh
else:
- flag = combine_flags(flag,
- rutf8.get_flag_from_code(outCh))
outsize += 1
assert outCh >= 0
rutf8.unichr_as_utf8_append(result, outCh, True)
@@ -802,9 +792,8 @@
msg = "partial character in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
- reslen, resflags = rutf8.check_utf8(res, True)
+ reslen = rutf8.check_utf8(res, True)
outsize += reslen
- flag = combine_flags(flag, resflags)
result.append(res)
continue
else:
@@ -814,15 +803,13 @@
msg = "non-zero padding bits in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
- reslen, resflags = rutf8.check_utf8(res, True)
+ reslen = rutf8.check_utf8(res, True)
outsize += reslen
- flag = combine_flags(flag, resflags)
result.append(res)
continue
if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
outsize += 1
- flag = rutf8.FLAG_HAS_SURROGATES
rutf8.unichr_as_utf8_append(result, surrogate, True)
surrogate = 0
@@ -854,9 +841,8 @@
pos += 1
msg = "unexpected special character"
res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
- reslen, resflags = rutf8.check_utf8(res, True)
+ reslen = rutf8.check_utf8(res, True)
outsize += reslen
- flag = combine_flags(flag, resflags)
result.append(res)
# end of string
@@ -869,9 +855,8 @@
(base64bits > 0 and base64buffer != 0)):
msg = "unterminated shift sequence"
res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos,
pos)
- reslen, resflags = rutf8.check_utf8(res, True)
+ reslen = rutf8.check_utf8(res, True)
outsize += reslen
- flag = combine_flags(flag, resflags)
result.append(res)
final_length = result.getlength()
elif inShift:
@@ -879,7 +864,7 @@
final_length = shiftOutStartPos # back off output
assert final_length >= 0
- return result.build()[:final_length], pos, outsize, flag
+ return result.build()[:final_length], pos, outsize
def utf8_encode_utf_7(s, errors, errorhandler):
size = len(s)
@@ -942,21 +927,21 @@
def str_decode_utf_16(s, errors, final=True,
errorhandler=None):
- result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+ result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
errorhandler,
"native")
- return result, c, lgt, flag
+ return result, c, lgt
def str_decode_utf_16_be(s, errors, final=True,
errorhandler=None):
- result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+ result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
errorhandler, "big")
- return result, c, lgt, flag
+ return result, c, lgt
def str_decode_utf_16_le(s, errors, final=True,
errorhandler=None):
- result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+ result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
errorhandler,
"little")
- return result, c, lgt, flag
+ return result, c, lgt
def str_decode_utf_16_helper(s, errors, final=True,
errorhandler=None,
@@ -999,7 +984,7 @@
else:
bo = 1
if size == 0:
- return '', 0, 0, rutf8.FLAG_ASCII, bo
+ return '', 0, 0, bo
if bo == -1:
# force little endian
ihi = 1
@@ -1058,8 +1043,8 @@
s, pos - 2, pos)
result.append(r)
r = result.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return result.build(), pos, lgt, flag, bo
+ lgt = rutf8.check_utf8(r, True)
+ return result.build(), pos, lgt, bo
def _STORECHAR(result, CH, byteorder):
hi = chr(((CH) >> 8) & 0xff)
@@ -1148,21 +1133,21 @@
def str_decode_utf_32(s, errors, final=True,
errorhandler=None):
- result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+ result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
errorhandler,
"native")
- return result, c, lgt, flag
+ return result, c, lgt
def str_decode_utf_32_be(s, errors, final=True,
errorhandler=None):
- result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+ result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
errorhandler, "big")
- return result, c, lgt, flag
+ return result, c, lgt
def str_decode_utf_32_le(s, errors, final=True,
errorhandler=None):
- result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+ result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
errorhandler,
"little")
- return result, c, lgt, flag
+ return result, c, lgt
BOM32_DIRECT = intmask(0x0000FEFF)
BOM32_REVERSE = intmask(0xFFFE0000)
@@ -1208,7 +1193,7 @@
else:
bo = 1
if size == 0:
- return '', 0, 0, rutf8.FLAG_ASCII, bo
+ return '', 0, 0, bo
if bo == -1:
# force little endian
iorder = [0, 1, 2, 3]
@@ -1243,8 +1228,8 @@
rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True)
pos += 4
r = result.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return r, pos, lgt, flag, bo
+ lgt = rutf8.check_utf8(r, True)
+ return r, pos, lgt, bo
def _STORECHAR32(result, CH, byteorder):
c0 = chr(((CH) >> 24) & 0xff)
@@ -1330,7 +1315,7 @@
errorhandler=None):
size = len(s)
if size == 0:
- return '', 0, 0, rutf8.FLAG_ASCII
+ return '', 0, 0
unicode_bytes = 4
if BYTEORDER == "little":
@@ -1367,8 +1352,8 @@
rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
pos += unicode_bytes
r = result.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return r, pos, lgt, flag
+ lgt = rutf8.check_utf8(r, True)
+ return r, pos, lgt
def utf8_encode_unicode_internal(s, errors, errorhandler):
size = len(s)
@@ -1409,7 +1394,7 @@
errorhandler=errorhandler)
size = len(s)
if size == 0:
- return '', 0, 0, rutf8.FLAG_ASCII
+ return '', 0, 0
pos = 0
result = StringBuilder(size)
@@ -1426,8 +1411,8 @@
result.append(c)
pos += 1
r = result.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return r, pos, lgt, flag
+ lgt = rutf8.check_utf8(r, True)
+ return r, pos, lgt
def utf8_encode_charmap(s, errors, errorhandler=None,
mapping=None):
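The rewritten _utf8_encode_latin_1_slowpath above drives the string with
rutf8.Utf8StringIterator instead of tracking byte positions by hand through
codepoint_at_pos()/next_codepoint_pos(). A small standalone sketch of that
iteration pattern (the helper name and its body are illustrative only, not
part of the changeset):

    from rpython.rlib import rutf8

    def latin1_encodable(utf8_s):
        # walk the codepoints of an RPython utf-8 string; the iterator
        # yields integers, so no manual position bookkeeping is needed
        out = []
        for ch in rutf8.Utf8StringIterator(utf8_s):
            if ch <= 0xFF:
                out.append(chr(ch))
        return ''.join(out)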
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -26,14 +26,8 @@
"Return a Unicode string of one character with the given ordinal."
if code < 0 or code > 0x10FFFF:
raise oefmt(space.w_ValueError, "unichr() arg out of range")
- elif code < 0x80:
- flag = rutf8.FLAG_ASCII
- elif 0xD800 <= code <= 0xDFFF:
- flag = rutf8.FLAG_HAS_SURROGATES
- else:
- flag = rutf8.FLAG_REGULAR
s = rutf8.unichr_as_utf8(code, allow_surrogates=True)
- return space.newutf8(s, 1, flag)
+ return space.newutf8(s, 1)
def len(space, w_obj):
"len(object) -> integer\n\nReturn the number of items of a sequence or
mapping."
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -183,8 +183,7 @@
raise oefmt(self.space.w_ValueError,
"%s out of range for conversion to unicode: %s",
self.name, s)
- flag = rutf8.get_flag_from_code(intmask(value))
- return self.space.newutf8(utf8, 1, flag)
+ return self.space.newutf8(utf8, 1)
def string(self, cdataobj, maxlen):
with cdataobj as ptr:
@@ -215,15 +214,15 @@
def unpack_ptr(self, w_ctypeptr, ptr, length):
if self.size == 2:
- utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length)
+ utf8, lgt = wchar_helper.utf8_from_char16(ptr, length)
else:
try:
- utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length)
+ utf8, lgt = wchar_helper.utf8_from_char32(ptr, length)
except wchar_helper.OutOfRange as e:
raise oefmt(self.space.w_ValueError,
"%s out of range for conversion to unicode: %s",
self.name, hex(e.ordinal))
- return self.space.newutf8(utf8, lgt, flag)
+ return self.space.newutf8(utf8, lgt)
class W_CTypePrimitiveSigned(W_CTypePrimitive):
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -19,16 +19,14 @@
ptr = rffi.cast(rffi.UINTP, ptr)
u = StringBuilder(length)
j = 0
- flag = rutf8.FLAG_ASCII
while j < length:
ch = intmask(ptr[j])
j += 1
- flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
try:
rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
except ValueError:
raise OutOfRange(ch)
- return u.build(), length, flag
+ return u.build(), length
def utf8_from_char16(ptr, length):
# 'ptr' is a pointer to 'length' 16-bit integers
@@ -36,7 +34,6 @@
u = StringBuilder(length)
j = 0
result_length = length
- flag = rutf8.FLAG_ASCII
while j < length:
ch = intmask(ptr[j])
j += 1
@@ -46,9 +43,8 @@
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
j += 1
result_length -= 1
- flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
- return u.build(), result_length, flag
+ return u.build(), result_length
@specialize.ll()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -43,8 +43,8 @@
length = len(input)
else:
w_cls = space.w_UnicodeEncodeError
- length, flag = rutf8.check_utf8(input, allow_surrogates=True)
- w_input = space.newutf8(input, length, flag)
+ length = rutf8.check_utf8(input, allow_surrogates=True)
+ w_input = space.newutf8(input, length)
w_exc = space.call_function(
w_cls,
space.newtext(encoding),
@@ -192,7 +192,7 @@
def ignore_errors(space, w_exc):
check_exception(space, w_exc)
w_end = space.getattr(w_exc, space.newtext('end'))
- return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), w_end])
+ return space.newtuple([space.newutf8('', 0), w_end])
REPLACEMENT = u'\ufffd'.encode('utf8')
@@ -203,13 +203,13 @@
size = space.int_w(w_end) - space.int_w(w_start)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
text = '?' * size
- return space.newtuple([space.newutf8(text, size, rutf8.FLAG_ASCII), w_end])
+ return space.newtuple([space.newutf8(text, size), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
text = REPLACEMENT
- return space.newtuple([space.newutf8(text, 1, rutf8.FLAG_REGULAR), w_end])
+ return space.newtuple([space.newutf8(text, 1), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
text = REPLACEMENT * size
- return space.newtuple([space.newutf8(text, size, rutf8.FLAG_REGULAR), w_end])
+ return space.newtuple([space.newutf8(text, size), w_end])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
@@ -237,8 +237,8 @@
builder.append(";")
pos = rutf8.next_codepoint_pos(obj, pos)
r = builder.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return space.newtuple([space.newutf8(r, lgt, flag), w_end])
+ lgt = rutf8.check_utf8(r, True)
+ return space.newtuple([space.newutf8(r, lgt), w_end])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
@@ -278,8 +278,8 @@
builder.append_slice(num, 2, lnum)
pos = rutf8.next_codepoint_pos(obj, pos)
r = builder.build()
- lgt, flag = rutf8.check_utf8(r, True)
- return space.newtuple([space.newutf8(r, lgt, flag), w_end])
+ lgt = rutf8.check_utf8(r, True)
+ return space.newtuple([space.newutf8(r, lgt), w_end])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
@@ -417,9 +417,9 @@
final = space.is_true(w_final)
state = space.fromcache(CodecState)
func = getattr(unicodehelper, rname)
- result, consumed, length, flag = func(string, errors,
+ result, consumed, length = func(string, errors,
final,
state.decode_error_handler)
- return space.newtuple([space.newutf8(result, length, flag),
+ return space.newtuple([space.newutf8(result, length),
space.newint(consumed)])
wrap_decoder.func_name = rname
globals()[name] = wrap_decoder
@@ -460,22 +460,12 @@
# utf-8 functions are not regular, because we have to pass
# "allow_surrogates=True"
-@unwrap_spec(utf8='utf8', errors='text_or_none')
-def utf_8_encode(space, utf8, errors="strict"):
- length, _ = rutf8.check_utf8(utf8, allow_surrogates=True)
- return space.newtuple([space.newbytes(utf8), space.newint(length)])
-#@unwrap_spec(uni=unicode, errors='text_or_none')
-#def utf_8_encode(space, uni, errors="strict"):
-# if errors is None:
-# errors = 'strict'
-# state = space.fromcache(CodecState)
-# # NB. can't call unicode_encode_utf_8() directly because that's
-# # an @elidable function nowadays. Instead, we need the _impl().
-# # (The problem is the errorhandler, which calls arbitrary Python.)
-# result = runicode.unicode_encode_utf_8_impl(
-# uni, len(uni), errors, state.encode_error_handler,
-# allow_surrogates=True)
-# return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+@unwrap_spec(errors='text_or_none')
+def utf_8_encode(space, w_obj, errors="strict"):
+ utf8, lgt = space.utf8_len_w(w_obj)
+ if rutf8.has_surrogates(utf8):
+ utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+ return space.newtuple([space.newbytes(utf8), space.newint(lgt)])
@unwrap_spec(string='bufferstr', errors='text_or_none',
w_final = WrappedDefault(False))
@@ -488,14 +478,14 @@
state = space.fromcache(CodecState)
# call the fast version for checking
try:
- lgt, flag = rutf8.check_utf8(string, allow_surrogates=True)
+ lgt = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError:
- res, consumed, lgt, flag = unicodehelper.str_decode_utf8(string,
+ res, consumed, lgt = unicodehelper.str_decode_utf8(string,
errors, final, state.decode_error_handler)
- return space.newtuple([space.newutf8(res, lgt, flag),
+ return space.newtuple([space.newutf8(res, lgt),
space.newint(consumed)])
else:
- return space.newtuple([space.newutf8(string, lgt, flag),
+ return space.newtuple([space.newutf8(string, lgt),
space.newint(len(string))])
@unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int,
@@ -516,10 +506,10 @@
consumed = len(data)
if final:
consumed = 0
- res, consumed, lgt, flag, byteorder = str_decode_utf_16_helper(
+ res, consumed, lgt, byteorder = str_decode_utf_16_helper(
data, errors, final,
state.decode_error_handler, byteorder)
- return space.newtuple([space.newutf8(res, lgt, flag),
+ return space.newtuple([space.newutf8(res, lgt),
space.newint(consumed),
space.newint(byteorder)])
@@ -539,10 +529,10 @@
consumed = len(data)
if final:
consumed = 0
- res, consumed, lgt, flag, byteorder = str_decode_utf_32_helper(
+ res, consumed, lgt, byteorder = str_decode_utf_32_helper(
data, errors, final,
state.decode_error_handler, byteorder)
- return space.newtuple([space.newutf8(res, lgt, flag),
+ return space.newtuple([space.newutf8(res, lgt),
space.newint(consumed),
space.newint(byteorder)])
@@ -632,7 +622,7 @@
if errors is None:
errors = 'strict'
if len(string) == 0:
- return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+ return space.newtuple([space.newutf8('', 0),
space.newint(0)])
if space.is_none(w_mapping):
@@ -642,9 +632,9 @@
final = True
state = space.fromcache(CodecState)
- result, consumed, lgt, flag = unicodehelper.str_decode_charmap(
+ result, consumed, lgt = unicodehelper.str_decode_charmap(
string, errors, final, state.decode_error_handler, mapping)
- return space.newtuple([space.newutf8(result, lgt, flag),
+ return space.newtuple([space.newutf8(result, lgt),
space.newint(consumed)])
@unwrap_spec(errors='text_or_none')
@@ -708,12 +698,12 @@
unicode_name_handler = state.get_unicodedata_handler(space)
- result, consumed, lgt, flag = unicodehelper.str_decode_unicode_escape(
+ result, consumed, lgt = unicodehelper.str_decode_unicode_escape(
string, errors,
final, state.decode_error_handler,
unicode_name_handler)
- return space.newtuple([space.newutf8(result, lgt, flag), space.newint(consumed)])
+ return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)])
# ____________________________________________________________
# Unicode-internal
@@ -731,15 +721,15 @@
string = space.readbuf_w(w_string).as_str()
if len(string) == 0:
- return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+ return space.newtuple([space.newutf8('', 0),
space.newint(0)])
final = True
state = space.fromcache(CodecState)
- result, consumed, lgt, flag = unicodehelper.str_decode_unicode_internal(
+ result, consumed, lgt = unicodehelper.str_decode_unicode_internal(
string, errors,
final, state.decode_error_handler)
- return space.newtuple([space.newutf8(result, lgt, flag),
+ return space.newtuple([space.newutf8(result, lgt),
space.newint(consumed)])
# ____________________________________________________________
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,3 +1,5 @@
+from rpython.rlib.rutf8 import get_utf8_length
+
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.typedef import (
TypeDef, generic_new_descr, GetSetProperty)
@@ -152,7 +154,7 @@
if self.readnl is None:
w_readnl = space.w_None
else:
- w_readnl = space.str(space.new_from_utf8(self.readnl)) # YYY
+ w_readnl = space.str(space.newutf8(self.readnl, get_utf8_length(self.readnl))) # YYY
return space.newtuple([
w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
])
@@ -215,7 +217,8 @@
if self.writenl:
w_decoded = space.call_method(
w_decoded, "replace",
- space.newtext("\n"), space.new_from_utf8(self.writenl))
+ space.newtext("\n"), space.newutf8(self.writenl,
+ get_utf8_length(self.writenl)))
string = space.utf8_w(w_decoded)
if string:
self.buf.write(string)
@@ -225,7 +228,9 @@
def read_w(self, space, w_size=None):
self._check_closed(space)
size = convert_size(space, w_size)
- return space.new_from_utf8(self.buf.read(size))
+ v = self.buf.read(size)
+ lgt = get_utf8_length(v)
+ return space.newutf8(v, lgt)
def readline_w(self, space, w_limit=None):
self._check_closed(space)
@@ -239,7 +244,8 @@
else:
newline = self.readnl
result = self.buf.readline(newline, limit)
- return space.new_from_utf8(result)
+ resultlen = get_utf8_length(result)
+ return space.newutf8(result, resultlen)
@unwrap_spec(pos=int, mode=int)
@@ -276,7 +282,9 @@
def getvalue_w(self, space):
self._check_closed(space)
- return space.new_from_utf8(self.buf.getvalue())
+ v = self.buf.getvalue()
+ lgt = get_utf8_length(v)
+ return space.newutf8(v, lgt)
def readable_w(self, space):
self._check_closed(space)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -11,8 +11,9 @@
from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong
from rpython.rlib.rbigint import rbigint
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import (
- FLAG_ASCII, check_utf8, next_codepoint_pos, codepoints_in_utf8)
+from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos,
+ codepoints_in_utf8, get_utf8_length,
+ Utf8StringBuilder)
STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -31,22 +32,22 @@
def __init__(self, space):
self.w_newlines_dict = {
- SEEN_CR: space.newutf8("\r", 1, FLAG_ASCII),
- SEEN_LF: space.newutf8("\n", 1, FLAG_ASCII),
- SEEN_CRLF: space.newutf8("\r\n", 2, FLAG_ASCII),
+ SEEN_CR: space.newutf8("\r", 1),
+ SEEN_LF: space.newutf8("\n", 1),
+ SEEN_CRLF: space.newutf8("\r\n", 2),
SEEN_CR | SEEN_LF: space.newtuple(
- [space.newutf8("\r", 1, FLAG_ASCII),
- space.newutf8("\n", 1, FLAG_ASCII)]),
+ [space.newutf8("\r", 1),
+ space.newutf8("\n", 1)]),
SEEN_CR | SEEN_CRLF: space.newtuple(
- [space.newutf8("\r", 1, FLAG_ASCII),
- space.newutf8("\r\n", 2, FLAG_ASCII)]),
+ [space.newutf8("\r", 1),
+ space.newutf8("\r\n", 2)]),
SEEN_LF | SEEN_CRLF: space.newtuple(
- [space.newutf8("\n", 1, FLAG_ASCII),
- space.newutf8("\r\n", 2, FLAG_ASCII)]),
+ [space.newutf8("\n", 1),
+ space.newutf8("\r\n", 2)]),
SEEN_CR | SEEN_LF | SEEN_CRLF: space.newtuple(
- [space.newutf8("\r", 1, FLAG_ASCII),
- space.newutf8("\n", 1, FLAG_ASCII),
- space.newutf8("\r\n", 2, FLAG_ASCII)]),
+ [space.newutf8("\r", 1),
+ space.newutf8("\n", 1),
+ space.newutf8("\r\n", 2)]),
}
@unwrap_spec(translate=int)
@@ -98,7 +99,7 @@
output_len -= 1
if output_len == 0:
- return space.newutf8("", 0, FLAG_ASCII)
+ return space.newutf8("", 0)
# Record which newlines are read and do newline translation if
# desired, all in one pass.
@@ -153,8 +154,8 @@
output = builder.build()
self.seennl |= seennl
- lgt, flag = check_utf8(output, True)
- return space.newutf8(output, lgt, flag)
+ lgt = check_utf8(output, True)
+ return space.newutf8(output, lgt)
def reset_w(self, space):
self.seennl = 0
@@ -684,13 +685,15 @@
w_bytes = space.call_method(self.w_buffer, "read")
w_decoded = space.call_method(self.w_decoder, "decode", w_bytes,
space.w_True)
check_decoded(space, w_decoded)
- w_result = space.new_from_utf8(self.decoded.get_chars(-1))
+ chars = self.decoded.get_chars(-1)
+ lgt = get_utf8_length(chars)
+ w_result = space.newutf8(chars, lgt)
w_final = space.add(w_result, w_decoded)
self.snapshot = None
return w_final
remaining = size
- builder = StringBuilder(size)
+ builder = Utf8StringBuilder(size)
# Keep reading chunks until we have n characters to return
while remaining > 0:
@@ -700,7 +703,7 @@
builder.append(data)
remaining -= len(data)
- return space.new_from_utf8(builder.build())
+ return space.newutf8(builder.build(), builder.get_length())
def _scan_line_ending(self, limit):
if self.readuniversal:
@@ -725,6 +728,7 @@
limit = convert_size(space, w_limit)
remnant = None
builder = StringBuilder()
+ # XXX maybe use Utf8StringBuilder instead?
while True:
# First, get some data if necessary
has_data = self._ensure_data(space)
@@ -771,7 +775,8 @@
self.decoded.reset()
result = builder.build()
- return space.new_from_utf8(result)
+ lgt = get_utf8_length(result)
+ return space.newutf8(result, lgt)
# _____________________________________________________________
# write methods
@@ -794,8 +799,8 @@
if text.find('\n') >= 0:
haslf = True
if haslf and self.writetranslate and self.writenl:
- w_text = space.call_method(w_text, "replace", space.new_from_utf8('\n'),
- space.new_from_utf8(self.writenl))
+ w_text = space.call_method(w_text, "replace", space.newutf8('\n', 1),
+ space.newutf8(self.writenl, get_utf8_length(self.writenl)))
text = space.utf8_w(w_text)
needflush = False
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -157,7 +157,7 @@
replace, end = errorcb(errors, namecb, reason,
stringdata, start, end)
# 'replace' is RPython unicode here
- lgt, _ = rutf8.check_utf8(replace, True)
+ lgt = rutf8.get_utf8_length(replace)
inbuf = rffi.utf82wcharp(replace, lgt)
try:
r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
@@ -268,7 +268,7 @@
rets, end = errorcb(errors, namecb, reason,
unicodedata, start, end)
codec = pypy_cjk_enc_getcodec(encodebuf)
- lgt, _ = rutf8.get_utf8_length_flag(rets)
+ lgt = rutf8.get_utf8_length(rets)
replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
with rffi.scoped_nonmovingbuffer(replace) as inbuf:
r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -66,8 +66,8 @@
pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
assert 0 <= pos <= len(object)
self.pending = object[pos:]
- lgt, flag = rutf8.get_utf8_length_flag(output)
- return space.newutf8(output, lgt, flag)
+ lgt = rutf8.get_utf8_length(output)
+ return space.newutf8(output, lgt)
@unwrap_spec(errors="text_or_none")
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -27,8 +27,8 @@
raise wrap_unicodedecodeerror(space, e, input, self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
- lgt, flag = rutf8.check_utf8(utf8_output, True)
- return space.newtuple([space.newutf8(utf8_output, lgt, flag),
+ lgt = rutf8.get_utf8_length(utf8_output)
+ return space.newtuple([space.newutf8(utf8_output, lgt),
space.newint(len(input))])
@unwrap_spec(errors="text_or_none")
@@ -78,12 +78,11 @@
space.newtext(e.reason)]))
def wrap_unicodeencodeerror(space, e, input, inputlen, name):
- _, flag = rutf8.check_utf8(input, True)
raise OperationError(
space.w_UnicodeEncodeError,
space.newtuple([
space.newtext(name),
- space.newutf8(input, inputlen, flag),
+ space.newutf8(input, inputlen),
space.newint(e.start),
space.newint(e.end),
space.newtext(e.reason)]))
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -14,7 +14,7 @@
codecname, string = argv[1], argv[2]
c = c_codecs.getcodec(codecname)
u = c_codecs.decode(c, string)
- lgt, _ = rutf8.get_utf8_length_flag(u)
+ lgt = rutf8.get_utf8_length(u)
r = c_codecs.encode(c, u, lgt)
print r
return 0
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -295,15 +295,15 @@
if bits & 0x80:
# the 8th bit is set, it's an utf8 string
content_utf8 = self.getslice(start, end)
- lgt, flag = unicodehelper.check_utf8_or_raise(self.space,
+ lgt = unicodehelper.check_utf8_or_raise(self.space,
content_utf8)
- return self.space.newutf8(content_utf8, lgt, flag)
+ return self.space.newutf8(content_utf8, lgt)
else:
# ascii only, fast path (ascii is a strict subset of
# latin1, and we already checked that all the chars are <
# 128)
return self.space.newutf8(self.getslice(start, end),
- end - start, rutf8.FLAG_ASCII)
+ end - start)
def decode_string_escaped(self, start):
i = self.pos
@@ -316,10 +316,10 @@
i += 1
if ch == '"':
content_utf8 = builder.build()
- lgt, f = unicodehelper.check_utf8_or_raise(self.space,
+ lgt = unicodehelper.check_utf8_or_raise(self.space,
content_utf8)
self.pos = i
- return self.space.newutf8(content_utf8, lgt, f)
+ return self.space.newutf8(content_utf8, lgt)
elif ch == '\\':
i = self.decode_escape_sequence(i, builder)
elif ch < '\x20':
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -11,7 +11,7 @@
dec.close()
class FakeSpace(object):
- def newutf8(self, s, l, f):
+ def newutf8(self, s, l):
return s
def test_decode_key():
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -228,8 +228,7 @@
return space.newbytes(chr(ucharval))
elif w_ffitype.is_unichar():
wcharval = self.get_unichar(w_ffitype)
- return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1,
- rutf8.get_flag_from_code(intmask(wcharval)))
+ return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
elif w_ffitype.is_double():
return self._float(w_ffitype)
elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -596,9 +596,9 @@
return space.w_None
wcharp_addr = rffi.cast(rffi.CWCHARP, address)
if maxlength == -1:
- s = rffi.wcharp2unicode(wcharp_addr)
+ s = rffi.wcharp2utf8(wcharp_addr)
else:
- s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
+ s = rffi.wcharpsize2utf8(wcharp_addr, maxlength)
return space.newunicode(s)
@unwrap_spec(address=r_uint, maxlength=int)
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,7 +6,7 @@
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
from rpython.rlib.rstring import StringBuilder
from rpython.rlib.rutf8 import Utf8StringBuilder
@@ -42,7 +42,9 @@
if isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string[start:end])
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- return space.newunicode(ctx._unicodestr[start:end])
+ s = ctx._unicodestr[start:end]
+ lgt = rutf8.check_utf8(s, True)
+ return space.newutf8(s, lgt)
else:
# unreachable
raise SystemError
@@ -110,7 +112,9 @@
if endpos < pos:
endpos = pos
if space.isinstance_w(w_string, space.w_unicode):
- unicodestr = space.unicode_w(w_string)
+ unicodestr = space.utf8_w(w_string)
+ # XXX will fail some tests, the length need to be adjusted for
+ # real char len etc
if pos > len(unicodestr):
pos = len(unicodestr)
if endpos > len(unicodestr):
@@ -337,11 +341,10 @@
else:
assert unicodebuilder is not None
return space.newutf8(unicodebuilder.build(),
- unicodebuilder.get_length(),
- unicodebuilder.get_flag()), n
+ unicodebuilder.get_length()), n
else:
if space.isinstance_w(w_string, space.w_unicode):
- w_emptystr = space.newunicode(u'')
+ w_emptystr = space.newutf8('', 0)
else:
w_emptystr = space.newbytes('')
w_item = space.call_method(w_emptystr, 'join',
@@ -575,7 +578,8 @@
elif isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string)
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- return space.newunicode(ctx._unicodestr)
+ lgt = rutf8.check_utf8(ctx._unicodestr, True)
+ return space.newutf8(ctx._unicodestr, lgt)
else:
raise SystemError
diff --git a/pypy/module/_warnings/interp_warnings.py b/pypy/module/_warnings/interp_warnings.py
--- a/pypy/module/_warnings/interp_warnings.py
+++ b/pypy/module/_warnings/interp_warnings.py
@@ -1,3 +1,6 @@
+
+from rpython.rlib import rutf8
+
from pypy.interpreter.gateway import unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
@@ -208,10 +211,11 @@
except OperationError as e:
if e.async(space):
raise
- message = u"%s:%d: %s: %s\n" % (space.unicode_w(w_filename), lineno,
- space.unicode_w(w_name),
- space.unicode_w(w_text))
- w_message = space.newunicode(message)
+ message = "%s:%d: %s: %s\n" % (space.utf8_w(w_filename), lineno,
+ space.utf8_w(w_name),
+ space.utf8_w(w_text))
+ lgt = rutf8.check_utf8(message, True)
+ w_message = space.newutf8(message, lgt)
else:
w_message = space.newtext(message)
space.call_method(w_stderr, "write", w_message)
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1,4 +1,4 @@
-from rpython.rlib import jit, rgc
+from rpython.rlib import jit, rgc, rutf8
from rpython.rlib.buffer import RawBuffer
from rpython.rlib.objectmodel import keepalive_until_here
from rpython.rlib.rarithmetic import ovfcheck, widen
@@ -451,7 +451,7 @@
"""
if self.typecode == 'u':
buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned())
- return space.newunicode(rffi.wcharpsize2unicode(buf, self.len))
+ return space.newutf8(rffi.wcharpsize2utf8(buf, self.len), self.len)
else:
raise oefmt(space.w_ValueError,
"tounicode() may only be called on type 'u' arrays")
@@ -797,7 +797,7 @@
TypeCode(rffi.UINT, 'int_w', True)
types = {
'c': TypeCode(lltype.Char, 'bytes_w', method=''),
- 'u': TypeCode(lltype.UniChar, 'unicode_w', method=''),
+ 'u': TypeCode(lltype.UniChar, 'utf8_len_w', method=''),
'b': TypeCode(rffi.SIGNEDCHAR, 'int_w', True, True),
'B': TypeCode(rffi.UCHAR, 'int_w', True),
'h': TypeCode(rffi.SHORT, 'int_w', True, True),
@@ -895,11 +895,17 @@
"unsigned %d-byte integer out of range",
mytype.bytes)
return rffi.cast(mytype.itemtype, item)
- if mytype.unwrap == 'bytes_w' or mytype.unwrap == 'unicode_w':
+ if mytype.unwrap == 'bytes_w':
if len(item) != 1:
raise oefmt(space.w_TypeError, "array item must be char")
item = item[0]
return rffi.cast(mytype.itemtype, item)
+ if mytype.unwrap == 'utf8_len_w':
+ utf8, lgt = item
+ if lgt != 1:
+ raise oefmt(space.w_TypeError, "array item must be char")
+ uchar = rutf8.codepoint_at_pos(utf8, 0)
+ return rffi.cast(mytype.itemtype, uchar)
#
# "regular" case: it fits in an rpython integer (lltype.Signed)
# or it is a float
@@ -1007,7 +1013,8 @@
elif mytype.typecode == 'c':
return space.newbytes(item)
elif mytype.typecode == 'u':
- return space.newunicode(item)
+ code = ord(item)
+ return space.newutf8(rutf8.unichr_as_utf8(code), 1)
assert 0, "unreachable"
# interface
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,5 +1,9 @@
+from rpython.rtyper.lltypesystem import rffi, lltype
+from rpython.rlib import rstring, runicode
+from rpython.tool.sourcetools import func_renamer
+
from pypy.interpreter.error import OperationError, oefmt
-from rpython.rtyper.lltypesystem import rffi, lltype
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
from pypy.module.unicodedata import unicodedb
from pypy.module.cpyext.api import (
CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -13,8 +17,6 @@
from pypy.module.sys.interp_encoding import setdefaultencoding
from pypy.module._codecs.interp_codecs import CodecState
from pypy.objspace.std import unicodeobject
-from rpython.rlib import rstring, runicode
-from rpython.tool.sourcetools import func_renamer
import sys
## See comment in bytesobject.py.
@@ -61,10 +63,10 @@
def unicode_attach(space, py_obj, w_obj, w_userdata=None):
"Fills a newly allocated PyUnicodeObject with a unicode string"
py_unicode = rffi.cast(PyUnicodeObject, py_obj)
- s = space.unicode_w(w_obj)
- py_unicode.c_length = len(s)
+ s, length = space.utf8_len_w(w_obj)
+ py_unicode.c_length = length
py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
- py_unicode.c_hash = space.hash_w(space.newunicode(s))
+ py_unicode.c_hash = space.hash_w(space.newutf8(s, length))
py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
def unicode_realize(space, py_obj):
@@ -73,11 +75,12 @@
be modified after this call.
"""
py_uni = rffi.cast(PyUnicodeObject, py_obj)
- s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length)
+ length = py_uni.c_length
+ s = wcharpsize2utf8(space, py_uni.c_str, length)
w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type))
w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type)
- w_obj.__init__(s)
- py_uni.c_hash = space.hash_w(space.newunicode(s))
+ w_obj.__init__(s, length)
+ py_uni.c_hash = space.hash_w(space.newutf8(s, length))
track_reference(space, py_obj, w_obj)
return w_obj
@@ -214,8 +217,8 @@
if not ref_unicode.c_str:
# Copy unicode buffer
w_unicode = from_ref(space, rffi.cast(PyObject, ref))
- u = space.unicode_w(w_unicode)
- ref_unicode.c_str = rffi.unicode2wcharp(u)
+ u, length = space.utf8_len_w(w_unicode)
+ ref_unicode.c_str = rffi.utf82wcharp(u, length)
return ref_unicode.c_str
@cpython_api([PyObject], rffi.CWCHARP)
@@ -335,8 +338,8 @@
Therefore, modification of the resulting Unicode object is only allowed when u
is NULL."""
if wchar_p:
- s = rffi.wcharpsize2unicode(wchar_p, length)
- return make_ref(space, space.newunicode(s))
+ s = wcharpsize2utf8(space, wchar_p, length)
+ return make_ref(space, space.newutf8(s, length))
else:
return rffi.cast(PyObject, new_empty_unicode(space, length))
@@ -506,7 +509,8 @@
"""Encode the Py_UNICODE buffer of the given size and return a
Python string object. Return NULL if an exception was raised
by the codec."""
- w_u = space.newunicode(rffi.wcharpsize2unicode(s, size))
+ u = wcharpsize2utf8(space, s, size)
+ w_u = space.newutf8(u, size)
if errors:
w_errors = space.newtext(rffi.charp2str(errors))
else:
@@ -706,12 +710,12 @@
"""Return 1 if substr matches str[start:end] at the given tail end
(direction == -1 means to do a prefix match, direction == 1 a
suffix match), 0 otherwise. Return -1 if an error occurred."""
- str = space.unicode_w(w_str)
- substr = space.unicode_w(w_substr)
+ w_start = space.newint(start)
+ w_end = space.newint(end)
if rffi.cast(lltype.Signed, direction) <= 0:
- return rstring.startswith(str, substr, start, end)
+ return space.call_method(w_str, "startswith", w_substr, w_start, w_end)
else:
- return rstring.endswith(str, substr, start, end)
+ return space.call_method(w_str, "endswith", w_substr, w_start, w_end)
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t,
error=-1)
def PyUnicode_Count(space, w_str, w_substr, start, end):
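The cpyext changes above funnel wide-char buffers through the new
wcharpsize2utf8() helper added to pypy/interpreter/unicodehelper.py earlier
in this diff, which wraps rffi.wcharpsize2utf8() and turns an out-of-range
wchar into an app-level ValueError. A sketch of the call shape used by
unicode_realize() and PyUnicode_FromUnicode() (variable names are
illustrative):

    # 'wchar_buf' is an rffi.CWCHARP holding 'n' wide characters
    s = wcharpsize2utf8(space, wchar_buf, n)   # may raise app-level ValueError
    w_uni = space.newutf8(s, n)                # the hunks above also pass n
                                               # as the codepoint length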
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -126,7 +126,7 @@
return space.call_function(space.w_unicode, w_as_str)
lgt = len(self.args_w)
if lgt == 0:
- return space.newunicode(u"")
+ return space.newutf8("", 0)
if lgt == 1:
return space.call_function(space.w_unicode, self.args_w[0])
else:
@@ -719,7 +719,7 @@
def descr_init(self, space, w_encoding, w_object, w_start, w_end,
w_reason):
# typechecking
space.realtext_w(w_encoding)
- space.utf8_w(w_object)
+ space.realutf8_w(w_object)
space.int_w(w_start)
space.int_w(w_end)
space.realtext_w(w_reason)
diff --git a/pypy/module/operator/tscmp.py b/pypy/module/operator/tscmp.py
--- a/pypy/module/operator/tscmp.py
+++ b/pypy/module/operator/tscmp.py
@@ -45,15 +45,15 @@
Note: If a and b are of different lengths, or if an error occurs, a
timing attack could theoretically reveal information about the types
and lengths of a and b--but not their values.
+
+ XXX note that here the strings have to have the same length as UTF8,
+ not only as unicode. Not sure how to do better
"""
if (space.isinstance_w(w_a, space.w_unicode) and
space.isinstance_w(w_b, space.w_unicode)):
- a = space.unicode_w(w_a)
- b = space.unicode_w(w_b)
- with rffi.scoped_nonmoving_unicodebuffer(a) as a_buf:
- with rffi.scoped_nonmoving_unicodebuffer(b) as b_buf:
- result = pypy_tscmp_wide(a_buf, b_buf, len(a), len(b))
- return space.newbool(rffi.cast(lltype.Bool, result))
+ a = space.utf8_w(w_a)
+ b = space.utf8_w(w_b)
+ return space.newbool(_compare_two_strings(a, b))
return compare_digest_buffer(space, w_a, w_b)
@@ -68,7 +68,10 @@
a = a_buf.as_str()
b = b_buf.as_str()
+ return space.newbool(_compare_two_strings(a, b))
+
+def _compare_two_strings(a, b):
with rffi.scoped_nonmovingbuffer(a) as a_buf:
with rffi.scoped_nonmovingbuffer(b) as b_buf:
result = pypy_tscmp(a_buf, b_buf, len(a), len(b))
- return space.newbool(rffi.cast(lltype.Bool, result))
+ return rffi.cast(lltype.Bool, result)
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -478,8 +478,8 @@
# I suppose this is a valid utf8, but there is noone to check
# and noone to catch an error either
try:
- lgt, flag = rutf8.check_utf8(s, True)
- return space.newutf8(s, lgt, flag)
+ lgt = rutf8.check_utf8(s, True)
+ return space.newutf8(s, lgt)
except rutf8.CheckError:
from pypy.interpreter import unicodehelper
# get the correct error msg
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -209,10 +209,7 @@
def newbytes(self, x):
return w_some_obj()
- def newutf8(self, x, l, f):
- return w_some_obj()
-
- def new_from_utf8(self, a):
+ def newutf8(self, x, l):
return w_some_obj()
def newunicode(self, a):
diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -195,11 +195,11 @@
w_dict = self.getdict(space)
if w_dict is None:
w_dict = space.w_None
- s, _, lgt, flag = str_decode_latin_1(''.join(self.getdata()), 'strict',
+ s, _, lgt = str_decode_latin_1(''.join(self.getdata()), 'strict',
True, None)
return space.newtuple([
space.type(self), space.newtuple([
- space.newutf8(s, lgt, flag), space.newtext('latin-1')]),
+ space.newutf8(s, lgt), space.newtext('latin-1')]),
w_dict])
@staticmethod
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1197,7 +1197,7 @@
unerase = staticmethod(unerase)
def wrap(self, unwrapped):
- return self.space.newutf8(unwrapped, len(unwrapped), rutf8.FLAG_ASCII)
+ return self.space.newutf8(unwrapped, len(unwrapped))
def unwrap(self, wrapped):
return self.space.utf8_w(wrapped)
@@ -1239,7 +1239,7 @@
## return self.space.newlist_bytes(self.listview_bytes(w_dict))
def wrapkey(space, key):
- return space.newutf8(key, len(key), rutf8.FLAG_ASCII)
+ return space.newutf8(key, len(key))
## @jit.look_inside_iff(lambda self, w_dict:
## w_dict_unrolling_heuristic(w_dict))
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -198,8 +198,8 @@
if self.w_valuedict is None:
raise oefmt(space.w_TypeError, "format requires a mapping")
if do_unicode:
- lgt, flag = rutf8.check_utf8(key, True)
- w_key = space.newutf8(key, lgt, flag)
+ lgt = rutf8.check_utf8(key, True)
+ w_key = space.newutf8(key, lgt)
else:
w_key = space.newbytes(key)
return space.getitem(self.w_valuedict, w_key)
@@ -330,8 +330,7 @@
space = self.space
if do_unicode:
cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
- flag = rutf8.get_flag_from_code(cp)
- w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag)
+ w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1)
else:
cp = ord(self.fmt[self.fmtpos - 1])
w_s = space.newbytes(chr(cp))
@@ -513,8 +512,8 @@
formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
result = formatter.format()
# this can force strings, not sure if it's a problem or not
- lgt, flag = rutf8.check_utf8(result, True)
- return space.newutf8(result, lgt, flag)
+ lgt = rutf8.check_utf8(result, True)
+ return space.newutf8(result, lgt)
def mod_format(space, w_format, w_values, do_unicode=False):
if space.isinstance_w(w_values, space.w_tuple):
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -1998,7 +1998,7 @@
def wrap(self, stringval):
assert stringval is not None
- return self.space.newutf8(stringval, len(stringval), rutf8.FLAG_ASCII)
+ return self.space.newutf8(stringval, len(stringval))
def unwrap(self, w_string):
return self.space.utf8_w(w_string)
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -403,8 +403,8 @@
@unmarshaller(TYPE_UNICODE)
def unmarshal_unicode(space, u, tc):
arg = u.get_str()
- length, flag = unicodehelper.check_utf8_or_raise(space, arg)
- return space.newutf8(arg, length, flag)
+ length = unicodehelper.check_utf8_or_raise(space, arg)
+ return space.newutf8(arg, length)
@marshaller(W_SetObject)
def marshal_set(space, w_set, m):
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -51,8 +51,8 @@
if for_unicode:
def wrap(self, u):
- lgt, flag = rutf8.check_utf8(u, True)
- return self.space.newutf8(u, lgt, flag)
+ lgt = rutf8.check_utf8(u, True)
+ return self.space.newutf8(u, lgt)
else:
def wrap(self, s):
return self.space.newbytes(s)
@@ -379,8 +379,8 @@
template = unicode_template_formatter(space,
space.utf8_w(w_string))
r = template.build(args)
- lgt, flag = rutf8.check_utf8(r, True)
- return space.newutf8(r, lgt, flag)
+ lgt = rutf8.check_utf8(r, True)
+ return space.newutf8(r, lgt)
else:
template = str_template_formatter(space, space.bytes_w(w_string))
return space.newbytes(template.build(args))
@@ -416,8 +416,8 @@
if for_unicode:
def wrap(self, u):
- lgt, flag = rutf8.check_utf8(u, True)
- return self.space.newutf8(u, lgt, flag)
+ lgt = rutf8.check_utf8(u, True)
+ return self.space.newutf8(u, lgt)
else:
def wrap(self, s):
return self.space.newbytes(s)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -165,8 +165,8 @@
return self.newtext(x)
if isinstance(x, unicode):
x = x.encode('utf8')
- lgt, flag = rutf8.check_utf8(x, True)
- return self.newutf8(x, lgt, flag)
+ lgt = rutf8.check_utf8(x, True)
+ return self.newutf8(x, lgt)
if isinstance(x, float):
return W_FloatObject(x)
if isinstance(x, W_Root):
@@ -362,16 +362,10 @@
return self.w_None
return self.newtext(s)
- def newutf8(self, utf8s, length, flag):
+ def newutf8(self, utf8s, length):
assert utf8s is not None
assert isinstance(utf8s, str)
- return W_UnicodeObject(utf8s, length, flag)
-
- def new_from_utf8(self, utf8s):
- # XXX: kill me!
- assert isinstance(utf8s, str)
- length, flag = rutf8.check_utf8(utf8s, True)
- return W_UnicodeObject(utf8s, length, flag)
+ return W_UnicodeObject(utf8s, length)
def newfilename(self, s):
assert isinstance(s, str) # on pypy3, this decodes the byte string
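
(Reading aid, not part of the changeset: the callers touched in this diff all
follow the same validate-and-wrap pattern against the two-argument newutf8().
A minimal sketch; the helper name wrap_valid_utf8 is invented for illustration:)

    from rpython.rlib import rutf8

    def wrap_valid_utf8(space, s):
        # s is a byte string expected to hold valid UTF-8.  check_utf8()
        # now returns only the codepoint length (no flag) and raises
        # rutf8.CheckError on malformed input.
        lgt = rutf8.check_utf8(s, True)   # second arg: allow surrogates
        return space.newutf8(s, lgt)      # no flag argument any more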
diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py
--- a/pypy/objspace/std/setobject.py
+++ b/pypy/objspace/std/setobject.py
@@ -1291,7 +1291,7 @@
return self.space.utf8_w(w_item)
def wrap(self, item):
- return self.space.newutf8(item, len(item), rutf8.FLAG_ASCII)
+ return self.space.newutf8(item, len(item))
def iter(self, w_set):
return UnicodeIteratorImplementation(self.space, self, w_set)
@@ -1495,7 +1495,7 @@
def next_entry(self):
for key in self.iterator:
- return self.space.newutf8(key, len(key), rutf8.FLAG_ASCII)
+ return self.space.newutf8(key, len(key))
else:
return None
diff --git a/pypy/objspace/std/test/test_index.py b/pypy/objspace/std/test/test_index.py
--- a/pypy/objspace/std/test/test_index.py
+++ b/pypy/objspace/std/test/test_index.py
@@ -265,8 +265,7 @@
class AppTest_UnicodeTestCase(SeqTestCase, StringTestCase):
def setup_method(self, method):
SeqTestCase.setup_method(self, method)
- self.w_seq = self.space.newutf8("this is a test", len("this is a test"),
- rutf8.FLAG_ASCII)
+ self.w_seq = self.space.newutf8("this is a test", len("this is a test"))
self.w_const = self.space.appexec([], """(): return unicode""")
diff --git a/pypy/objspace/std/test/test_lengthhint.py b/pypy/objspace/std/test/test_lengthhint.py
--- a/pypy/objspace/std/test/test_lengthhint.py
+++ b/pypy/objspace/std/test/test_lengthhint.py
@@ -74,8 +74,7 @@
self._test_length_hint(self.space.wrap('P' * self.SIZE))
def test_unicode(self):
- self._test_length_hint(self.space.newutf8('Y' * self.SIZE, self.SIZE,
- rutf8.FLAG_ASCII))
+ self._test_length_hint(self.space.newutf8('Y' * self.SIZE, self.SIZE))
def test_tuple(self):
self._test_length_hint(self.space.wrap(tuple(self.ITEMS)))
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -600,9 +600,9 @@
def test_unicode(self):
l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")])
assert isinstance(l1.strategy, BytesListStrategy)
- l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4, 2), self.space.newutf8("zwei", 4, 2)])
+ l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4), self.space.newutf8("zwei", 4)])
assert isinstance(l2.strategy, UnicodeListStrategy)
- l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4, 2)])
+ l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4)])
assert isinstance(l3.strategy, ObjectListStrategy)
def test_listview_bytes(self):
diff --git a/pypy/objspace/std/test/test_obj.py b/pypy/objspace/std/test/test_obj.py
--- a/pypy/objspace/std/test/test_obj.py
+++ b/pypy/objspace/std/test/test_obj.py
@@ -17,7 +17,7 @@
cls.w_cpython_apptest = space.wrap(option.runappdirect and not
hasattr(sys, 'pypy_translation_info'))
def w_unwrap_wrap_unicode(space, w_obj):
- return space.newutf8(space.utf8_w(w_obj), w_obj._length, w_obj._get_flag())
+ return space.newutf8(space.utf8_w(w_obj), w_obj._length)
cls.w_unwrap_wrap_unicode = space.wrap(gateway.interp2app(w_unwrap_wrap_unicode))
def w_unwrap_wrap_str(space, w_obj):
return space.wrap(space.str_w(w_obj))
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -27,12 +27,12 @@
assert len(warnings) == 2
def test_listview_unicode(self):
- w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
+ w_str = self.space.newutf8('abcd', 4)
assert self.space.listview_utf8(w_str) == list("abcd")
def test_new_shortcut(self):
space = self.space
- w_uni = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
+ w_uni = self.space.newutf8('abcd', 4)
w_new = space.call_method(
space.w_unicode, "__new__", space.w_unicode, w_uni)
assert w_new is w_uni
@@ -44,8 +44,8 @@
return # skip this case
v = u[start : start + len1]
space = self.space
- w_u = space.newutf8(u.encode('utf8'), len(u), rutf8.FLAG_REGULAR)
- w_v = space.newutf8(v.encode('utf8'), len(v), rutf8.FLAG_REGULAR)
+ w_u = space.newutf8(u.encode('utf8'), len(u))
+ w_v = space.newutf8(v.encode('utf8'), len(v))
expected = u.find(v, start, start + len1)
try:
w_index = space.call_method(w_u, 'index', w_v,
@@ -741,6 +741,8 @@
assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96'
+ assert u'\ud800\udc02'.encode('uTf-8') == '\xf0\x90\x80\x82'
+ assert u'\ud84d\udc56'.encode('Utf8') == '\xf0\xa3\x91\x96'
assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80'
assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80'
assert (u'\ud800\udc02'*1000).encode('utf-8') == '\xf0\x90\x80\x82'*1000
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -35,39 +35,22 @@
_immutable_fields_ = ['_utf8']
@enforceargs(utf8str=str)
- def __init__(self, utf8str, length, flag):
+ def __init__(self, utf8str, length):
assert isinstance(utf8str, str)
assert length >= 0
self._utf8 = utf8str
self._length = length
- if flag == rutf8.FLAG_ASCII:
- self._index_storage = rutf8.UTF8_IS_ASCII
- elif flag == rutf8.FLAG_HAS_SURROGATES:
- self._index_storage = rutf8.UTF8_HAS_SURROGATES
- else:
- assert flag == rutf8.FLAG_REGULAR
- self._index_storage = rutf8.null_storage()
+ self._index_storage = rutf8.null_storage()
# XXX checking, remove before any performance measurements
# ifdef not_running_in_benchmark
- lgt, flag_check = rutf8.check_utf8(utf8str, True)
- assert lgt == length
- if flag_check == rutf8.FLAG_ASCII:
- # there are cases where we copy part of REGULAR that happens
- # to be ascii
- assert flag in (rutf8.FLAG_ASCII, rutf8.FLAG_REGULAR)
- else:
- assert flag == flag_check
- # the storage can be one of:
- # - null, unicode with no surrogates
- # - rutf8.UTF8_HAS_SURROGATES
- # - rutf8.UTF8_IS_ASCII
- # - malloced object, which means it has index, then
- # _index_storage.flags determines the kind
+ if not we_are_translated():
+ lgt = rutf8.check_utf8(utf8str, True)
+ assert lgt == length
@staticmethod
def from_utf8builder(builder):
return W_UnicodeObject(
- builder.build(), builder.get_length(), builder.get_flag())
+ builder.build(), builder.get_length())
def __repr__(self):
"""representation for debugging purposes"""
@@ -107,8 +90,6 @@
return space.text_w(space.str(self))
def utf8_w(self, space):
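
(For orientation past the truncation point: pieced together from the
unicodeobject.py hunk above, the simplified constructor now reads roughly as
follows. This is a sketch reconstructed from the diff, not an authoritative
copy of the file.)

    def __init__(self, utf8str, length):
        assert isinstance(utf8str, str)
        assert length >= 0
        self._utf8 = utf8str
        self._length = length
        self._index_storage = rutf8.null_storage()
        # debug-only consistency check, skipped once translated
        if not we_are_translated():
            lgt = rutf8.check_utf8(utf8str, True)
            assert lgt == length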
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit