Author: Armin Rigo <ar...@tunes.org> Branch: unicode-utf8-re Changeset: r93304:be4b4c164598 Date: 2017-12-08 11:46 +0100 http://bitbucket.org/pypy/pypy/changeset/be4b4c164598/
Log: hg merge unicode-utf8 diff too long, truncating to 2000 out of 3797 lines diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -9,5 +9,6 @@ * remove assertions from W_UnicodeObject.__init__ if all the builders pass * what to do with error handlers that go backwards. There were tests in test_codecs that would check for that +* improve performance of splitlines * fix _pypyjson to not use a wrapped dict when decoding an object diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py --- a/extra_tests/test_textio.py +++ b/extra_tests/test_textio.py @@ -1,28 +1,48 @@ from hypothesis import given, strategies as st from io import BytesIO, TextIOWrapper +import os -LINESEP = ['', '\r', '\n', '\r\n'] +def translate_newlines(text): + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + return text.replace('\n', os.linesep) @st.composite -def text_with_newlines(draw): - sep = draw(st.sampled_from(LINESEP)) - lines = draw(st.lists(st.text(max_size=10), max_size=10)) - return sep.join(lines) +def st_readline_universal( + draw, st_nlines=st.integers(min_value=0, max_value=10)): + n_lines = draw(st_nlines) + lines = draw(st.lists( + st.text(st.characters(blacklist_characters='\r\n')), + min_size=n_lines, max_size=n_lines)) + limits = [] + for line in lines: + limit = draw(st.integers(min_value=0, max_value=len(line) + 5)) + limits.append(limit) + limits.append(-1) + endings = draw(st.lists( + st.sampled_from(['\n', '\r', '\r\n']), + min_size=n_lines, max_size=n_lines)) + return ( + ''.join(line + ending for line, ending in zip(lines, endings)), + limits) -@given(txt=text_with_newlines(), - mode=st.sampled_from(['\r', '\n', '\r\n', '']), - limit=st.integers(min_value=-1)) -def test_readline(txt, mode, limit): +@given(data=st_readline_universal(), + mode=st.sampled_from(['\r', '\n', '\r\n', '', None])) +def test_readline(data, mode): + txt, limits = data textio = TextIOWrapper( - BytesIO(txt.encode('utf-8')), encoding='utf-8', newline=mode) + BytesIO(txt.encode('utf-8', 'surrogatepass')), + encoding='utf-8', errors='surrogatepass', newline=mode) lines = [] - while True: + for limit in limits: line = textio.readline(limit) - if limit > 0: - assert len(line) < limit + if limit >= 0: + assert len(line) <= limit if line: lines.append(line) - else: + elif limit: break - assert u''.join(lines) == txt + if mode is None: + txt = translate_newlines(txt) + assert txt.startswith(u''.join(lines)) diff --git a/lib_pypy/resource.py b/lib_pypy/resource.py --- a/lib_pypy/resource.py +++ b/lib_pypy/resource.py @@ -20,6 +20,7 @@ or via the attributes ru_utime, ru_stime, ru_maxrss, and so on.""" __metaclass__ = _structseq.structseqtype + name = "resource.struct_rusage" ru_utime = _structseq.structseqfield(0, "user time used") ru_stime = _structseq.structseqfield(1, "system time used") diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -26,3 +26,6 @@ .. branch: fix-vmprof-stacklet-switch Fix a vmprof+continulets (i.e. greenelts, eventlet, gevent, ...) + +.. branch: win32-vcvars + diff --git a/pypy/doc/windows.rst b/pypy/doc/windows.rst --- a/pypy/doc/windows.rst +++ b/pypy/doc/windows.rst @@ -25,8 +25,10 @@ This compiler, while the standard one for Python 2.7, is deprecated. Microsoft has made it available as the `Microsoft Visual C++ Compiler for Python 2.7`_ (the link -was checked in Nov 2016). Note that the compiler suite will be installed in -``C:\Users\<user name>\AppData\Local\Programs\Common\Microsoft\Visual C++ for Python``. +was checked in Nov 2016). Note that the compiler suite may be installed in +``C:\Users\<user name>\AppData\Local\Programs\Common\Microsoft\Visual C++ for Python`` +or in +``C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python``. A current version of ``setuptools`` will be able to find it there. For Windows 10, you must right-click the download, and under ``Properties`` -> ``Compatibility`` mark it as ``Run run this program in comatibility mode for`` @@ -41,7 +43,6 @@ ----------------------------------- We routinely test translation using v9, also known as Visual Studio 2008. -Our buildbot is still using the Express Edition, not the compiler noted above. Other configurations may work as well. The translation scripts will set up the appropriate environment variables @@ -81,6 +82,30 @@ .. _build instructions: http://pypy.org/download.html#building-from-source +Setting Up Visual Studio for building SSL in Python3 +---------------------------------------------------- + +On Python3, the ``ssl`` module is based on ``cffi``, and requires a build step after +translation. However ``distutils`` does not support the Micorosft-provided Visual C +compiler, and ``cffi`` depends on ``distutils`` to find the compiler. The +traditional solution to this problem is to install the ``setuptools`` module +via running ``-m ensurepip`` which installs ``pip`` and ``setuptools``. However +``pip`` requires ``ssl``. So we have a chicken-and-egg problem: ``ssl`` depends on +``cffi`` which depends on ``setuptools``, which depends on ``ensurepip``, which +depends on ``ssl``. + +In order to solve this, the buildbot sets an environment varaible that helps +``distutils`` find the compiler without ``setuptools``:: + + set VS90COMNTOOLS=C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0\VC\bin + +or whatever is appropriate for your machine. Note that this is not enough, you +must also copy the ``vcvarsall.bat`` file fron the ``...\9.0`` directory to the +``...\9.0\VC`` directory, and edit it, changing the lines that set +``VCINSTALLDIR`` and ``WindowsSdkDir``:: + set VCINSTALLDIR=%~dp0\ + set WindowsSdkDir=%~dp0\..\WinSDK\ + Preparing Windows for the large build ------------------------------------- diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -1087,8 +1087,11 @@ def newlist_utf8(self, list_u, is_ascii): l_w = [None] * len(list_u) for i, item in enumerate(list_u): - length, flag = rutf8.check_utf8(item, True) - l_w[i] = self.newutf8(item, length, flag) + if not is_ascii: + length = rutf8.check_utf8(item, True) + else: + length = len(item) + l_w[i] = self.newutf8(item, length) return self.newlist(l_w) def newlist_int(self, list_i): diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -64,8 +64,8 @@ r = unicodehelper.decode_raw_unicode_escape(space, substr) else: r = unicodehelper.decode_unicode_escape(space, substr) - v, length, flag = r - return space.newutf8(v, length, flag) + v, length = r + return space.newutf8(v, length) need_encoding = (encoding is not None and encoding != "utf-8" and encoding != "utf8" and @@ -74,8 +74,8 @@ substr = s[ps : q] if rawmode or '\\' not in s[ps:]: if need_encoding: - lgt, flag = unicodehelper.check_utf8_or_raise(space, substr) - w_u = space.newutf8(substr, lgt, flag) + lgt = unicodehelper.check_utf8_or_raise(space, substr) + w_u = space.newutf8(substr, lgt) w_v = unicodehelper.encode(space, w_u, encoding) return w_v else: @@ -234,8 +234,8 @@ p = ps while p < end and ord(s[p]) & 0x80: p += 1 - lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p) - w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag), + lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p) + w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt), recode_encoding) v = space.bytes_w(w_v) return v, p diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -10,13 +10,13 @@ return str_decode_utf8(u, True, "strict", None) def test_decode_utf8(): - assert decode_utf8("abc") == ("abc", 3, 3, rutf8.FLAG_ASCII) - assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1, rutf8.FLAG_REGULAR) - assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES) - assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES) + assert decode_utf8("abc") == ("abc", 3, 3) + assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1) + assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1) + assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1) assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == ( - "\xed\xa0\x80\xed\xb0\x80", 6, 2, rutf8.FLAG_HAS_SURROGATES) - assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1, rutf8.FLAG_REGULAR) + "\xed\xa0\x80\xed\xb0\x80", 6, 2) + assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1) def test_utf8_encode_ascii(): assert utf8_encode_ascii("abc", "??", "??") == "abc" @@ -41,19 +41,19 @@ assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace") def test_str_decode_ascii(): - assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3, rutf8.FLAG_ASCII) + assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3) def eh(errors, encoding, reason, p, start, end): lst.append((errors, encoding, p, start, end)) return u"\u1234\u5678".encode("utf8"), end lst = [] input = "\xe8" exp = u"\u1234\u5678".encode("utf8") - assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, rutf8.FLAG_REGULAR) + assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2) assert lst == [("??", "ascii", input, 0, 1)] lst = [] input = "\xe8\xe9abc\xea\xeb" assert str_decode_ascii(input, "??", True, eh) == ( - exp + exp + "abc" + exp + exp, 7, 11, rutf8.FLAG_REGULAR) + exp + exp + "abc" + exp + exp, 7, 11) assert lst == [("??", "ascii", input, 0, 1), ("??", "ascii", input, 1, 2), ("??", "ascii", input, 5, 6), diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -3,7 +3,6 @@ from pypy.interpreter.error import OperationError from rpython.rlib.objectmodel import specialize from rpython.rlib import rutf8 -from rpython.rlib.rutf8 import combine_flags from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rstring import StringBuilder from pypy.module._codecs import interp_codecs @@ -26,10 +25,10 @@ # Fast version of the "strict" errors handler. def raise_unicode_exception_encode(errors, encoding, msg, utf8, startingpos, endingpos): - u_len, flag = rutf8.check_utf8(utf8, True) + u_len = rutf8.check_utf8(utf8, True) raise OperationError(space.w_UnicodeEncodeError, space.newtuple([space.newtext(encoding), - space.newutf8(utf8, u_len, flag), + space.newutf8(utf8, u_len), space.newint(startingpos), space.newint(endingpos), space.newtext(msg)])) @@ -55,18 +54,18 @@ def decode_unicode_escape(space, string): state = space.fromcache(interp_codecs.CodecState) unicodedata_handler = state.get_unicodedata_handler(space) - result_utf8, consumed, length, flag = str_decode_unicode_escape( + result_utf8, consumed, length = str_decode_unicode_escape( string, "strict", final=True, errorhandler=decode_error_handler(space), ud_handler=unicodedata_handler) - return result_utf8, length, flag + return result_utf8, length def decode_raw_unicode_escape(space, string): - result_utf8, consumed, lgt, flag = str_decode_raw_unicode_escape( + result_utf8, consumed, lgt = str_decode_raw_unicode_escape( string, "strict", final=True, errorhandler=decode_error_handler(space)) - return result_utf8, lgt, flag + return result_utf8, lgt def check_ascii_or_raise(space, string): try: @@ -83,19 +82,19 @@ # you still get two surrogate unicode characters in the result. # These are the Python2 rules; Python3 differs. try: - length, flag = rutf8.check_utf8(string, True, start, end) + length = rutf8.check_utf8(string, True, start, end) except rutf8.CheckError as e: # convert position into unicode position - lgt, flags = rutf8.check_utf8(string, True, start, stop=e.pos) + lgt = rutf8.check_utf8(string, True, start, stop=e.pos) decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string, start + lgt, start + lgt + 1) assert False, "unreachable" - return length, flag + return length def str_decode_ascii(s, errors, final, errorhandler): try: rutf8.check_ascii(s) - return s, len(s), len(s), rutf8.FLAG_ASCII + return s, len(s), len(s) except rutf8.CheckError: return _str_decode_ascii_slowpath(s, errors, final, errorhandler) @@ -112,13 +111,13 @@ res.append(ch) i += 1 ress = res.build() - lgt, flag = rutf8.check_utf8(ress, True) - return ress, len(s), lgt, flag + lgt = rutf8.check_utf8(ress, True) + return ress, len(s), lgt def str_decode_latin_1(s, errors, final, errorhandler): try: rutf8.check_ascii(s) - return s, len(s), len(s), rutf8.FLAG_ASCII + return s, len(s), len(s) except rutf8.CheckError: return _str_decode_latin_1_slowpath(s, errors, final, errorhandler) @@ -138,7 +137,7 @@ res.append_slice(s, start, end) i = end # cannot be ASCII, cannot have surrogates, I believe - return res.build(), len(s), len(s), rutf8.FLAG_REGULAR + return res.build(), len(s), len(s) def utf8_encode_latin_1(s, errors, errorhandler): try: @@ -149,37 +148,32 @@ def _utf8_encode_latin_1_slowpath(s, errors, errorhandler): res = StringBuilder(len(s)) - size = len(s) cur = 0 - i = 0 - while i < size: - if ord(s[i]) <= 0x7F: - res.append(s[i]) - i += 1 + iter = rutf8.Utf8StringIterator(s) + while True: + try: + ch = iter.next() + except StopIteration: + break + if ch <= 0xFF: + res.append(chr(ch)) cur += 1 else: - oc = rutf8.codepoint_at_pos(s, i) - if oc <= 0xFF: - res.append(chr(oc)) - cur += 1 - i = rutf8.next_codepoint_pos(s, i) - else: - r, pos = errorhandler(errors, 'latin1', - 'ordinal not in range(256)', s, cur, - cur + 1) - for j in range(pos - cur): - i = rutf8.next_codepoint_pos(s, i) + r, pos = errorhandler(errors, 'latin1', + 'ordinal not in range(256)', s, cur, + cur + 1) - j = 0 - while j < len(r): - c = rutf8.codepoint_at_pos(r, j) - if c > 0xFF: - errorhandler("strict", 'latin1', - 'ordinal not in range(256)', s, - cur, cur + 1) - j = rutf8.next_codepoint_pos(r, j) - res.append(chr(c)) - cur = pos + for c in rutf8.Utf8StringIterator(r): + if c > 0xFF: + errorhandler("strict", 'latin1', + 'ordinal not in range(256)', s, + cur, cur + 1) + res.append(chr(c)) + + for j in range(pos - cur - 1): + iter.next() + + cur = pos r = res.build() return r @@ -341,8 +335,7 @@ res.append(r) r = res.build() - lgt, flag = rutf8.check_utf8(r, True) - return r, pos, lgt, flag + return r, pos, rutf8.check_utf8(r, True) hexdigits = "0123456789ABCDEFabcdef" @@ -355,7 +348,7 @@ endinpos += 1 res, pos = errorhandler(errors, encoding, message, s, pos-2, endinpos) - size, flag = rutf8.check_utf8(res, True) + size = rutf8.check_utf8(res, True) builder.append(res) else: try: @@ -366,7 +359,7 @@ endinpos += 1 res, pos = errorhandler(errors, encoding, message, s, pos-2, endinpos) - size, flag = rutf8.check_utf8(res, True) + size = rutf8.check_utf8(res, True) builder.append(res) else: # when we get here, chr is a 32-bit unicode character @@ -376,21 +369,19 @@ message = "illegal Unicode character" res, pos = errorhandler(errors, encoding, message, s, pos-2, pos+digits) - size, flag = rutf8.check_utf8(res, True) + size = rutf8.check_utf8(res, True) builder.append(res) else: - flag = rutf8.get_flag_from_code(intmask(chr)) pos += digits size = 1 - return pos, size, flag + return pos, size def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler): size = len(s) if size == 0: - return '', 0, 0, rutf8.FLAG_ASCII + return '', 0, 0 - flag = rutf8.FLAG_ASCII builder = StringBuilder(size) pos = 0 outsize = 0 @@ -401,7 +392,6 @@ if ch != '\\': if ord(ch) > 0x7F: rutf8.unichr_as_utf8_append(builder, ord(ch)) - flag = combine_flags(rutf8.FLAG_REGULAR, flag) else: builder.append(ch) pos += 1 @@ -414,9 +404,8 @@ message = "\\ at end of string" res, pos = errorhandler(errors, "unicodeescape", message, s, pos-1, size) - newsize, newflag = rutf8.check_utf8(res, True) + newsize = rutf8.check_utf8(res, True) outsize + newsize - flag = combine_flags(flag, newflag) builder.append(res) continue @@ -469,7 +458,6 @@ outsize += 1 if x > 0x7F: rutf8.unichr_as_utf8_append(builder, x) - flag = combine_flags(rutf8.FLAG_REGULAR, flag) else: builder.append(chr(x)) # hex escapes @@ -477,27 +465,24 @@ elif ch == 'x': digits = 2 message = "truncated \\xXX escape" - pos, newsize, newflag = hexescape(builder, s, pos, digits, + pos, newsize = hexescape(builder, s, pos, digits, "unicodeescape", errorhandler, message, errors) - flag = combine_flags(flag, newflag) outsize += newsize # \uXXXX elif ch == 'u': digits = 4 message = "truncated \\uXXXX escape" - pos, newsize, newflag = hexescape(builder, s, pos, digits, + pos, newsize = hexescape(builder, s, pos, digits, "unicodeescape", errorhandler, message, errors) - flag = combine_flags(flag, newflag) outsize += newsize # \UXXXXXXXX elif ch == 'U': digits = 8 message = "truncated \\UXXXXXXXX escape" - pos, newsize, newflag = hexescape(builder, s, pos, digits, + pos, newsize = hexescape(builder, s, pos, digits, "unicodeescape", errorhandler, message, errors) - flag = combine_flags(flag, newflag) outsize += newsize # \N{name} @@ -517,29 +502,25 @@ if code < 0: res, pos = errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) - newsize, newflag = rutf8.check_utf8(res, True) - flag = combine_flags(flag, newflag) + newsize = rutf8.check_utf8(res, True) outsize += newsize builder.append(res) continue pos = look + 1 outsize += 1 - flag = combine_flags(flag, rutf8.get_flag_from_code(code)) rutf8.unichr_as_utf8_append(builder, code, allow_surrogates=True) # xxx 'code' is probably always within range here... else: res, pos = errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) - newsize, newflag = rutf8.check_utf8(res, True) - flag = combine_flags(flag, newflag) + newsize = rutf8.check_utf8(res, True) outsize += newsize builder.append(res) else: res, pos = errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) - newsize, newflag = rutf8.check_utf8(res, True) - flag = combine_flags(flag, newflag) + newsize = rutf8.check_utf8(res, True) outsize += newsize builder.append(res) else: @@ -547,7 +528,7 @@ builder.append(ch) outsize += 2 - return builder.build(), pos, outsize, flag + return builder.build(), pos, outsize # ____________________________________________________________ # Raw unicode escape @@ -556,7 +537,7 @@ errorhandler=None): size = len(s) if size == 0: - return '', 0, 0, rutf8.FLAG_ASCII + return '', 0, 0 result = StringBuilder(size) pos = 0 @@ -598,8 +579,8 @@ "rawunicodeescape", errorhandler, message, errors) r = result.build() - lgt, flag = rutf8.check_utf8(r, True) - return r, pos, lgt, flag + lgt = rutf8.check_utf8(r, True) + return r, pos, lgt _utf8_encode_unicode_escape = rutf8.make_utf8_escape_function() @@ -734,7 +715,7 @@ errorhandler=None): size = len(s) if size == 0: - return '', 0, 0, rutf8.FLAG_ASCII + return '', 0, 0 inShift = False base64bits = 0 @@ -745,7 +726,6 @@ result = StringBuilder(size) pos = 0 shiftOutStartPos = 0 - flag = rutf8.FLAG_ASCII startinpos = 0 while pos < size: ch = s[pos] @@ -771,13 +751,11 @@ (outCh & 0x3FF)) + 0x10000 rutf8.unichr_as_utf8_append(result, code) outsize += 1 - flag = combine_flags(flag, rutf8.FLAG_REGULAR) surrogate = 0 continue else: rutf8.unichr_as_utf8_append(result, surrogate, allow_surrogates=True) - flag = rutf8.FLAG_HAS_SURROGATES outsize += 1 surrogate = 0 # Not done with outCh: falls back to next line @@ -785,8 +763,6 @@ # first surrogate surrogate = outCh else: - flag = combine_flags(flag, - rutf8.get_flag_from_code(outCh)) outsize += 1 assert outCh >= 0 rutf8.unichr_as_utf8_append(result, outCh, True) @@ -802,9 +778,8 @@ msg = "partial character in shift sequence" res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos) - reslen, resflags = rutf8.check_utf8(res, True) + reslen = rutf8.check_utf8(res, True) outsize += reslen - flag = combine_flags(flag, resflags) result.append(res) continue else: @@ -814,15 +789,13 @@ msg = "non-zero padding bits in shift sequence" res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos) - reslen, resflags = rutf8.check_utf8(res, True) + reslen = rutf8.check_utf8(res, True) outsize += reslen - flag = combine_flags(flag, resflags) result.append(res) continue if surrogate and _utf7_DECODE_DIRECT(ord(ch)): outsize += 1 - flag = rutf8.FLAG_HAS_SURROGATES rutf8.unichr_as_utf8_append(result, surrogate, True) surrogate = 0 @@ -854,9 +827,8 @@ pos += 1 msg = "unexpected special character" res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos) - reslen, resflags = rutf8.check_utf8(res, True) + reslen = rutf8.check_utf8(res, True) outsize += reslen - flag = combine_flags(flag, resflags) result.append(res) # end of string @@ -869,9 +841,8 @@ (base64bits > 0 and base64buffer != 0)): msg = "unterminated shift sequence" res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos) - reslen, resflags = rutf8.check_utf8(res, True) + reslen = rutf8.check_utf8(res, True) outsize += reslen - flag = combine_flags(flag, resflags) result.append(res) final_length = result.getlength() elif inShift: @@ -879,7 +850,7 @@ final_length = shiftOutStartPos # back off output assert final_length >= 0 - return result.build()[:final_length], pos, outsize, flag + return result.build()[:final_length], pos, outsize def utf8_encode_utf_7(s, errors, errorhandler): size = len(s) @@ -942,21 +913,21 @@ def str_decode_utf_16(s, errors, final=True, errorhandler=None): - result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final, + result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final, errorhandler, "native") - return result, c, lgt, flag + return result, c, lgt def str_decode_utf_16_be(s, errors, final=True, errorhandler=None): - result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final, + result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final, errorhandler, "big") - return result, c, lgt, flag + return result, c, lgt def str_decode_utf_16_le(s, errors, final=True, errorhandler=None): - result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final, + result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final, errorhandler, "little") - return result, c, lgt, flag + return result, c, lgt def str_decode_utf_16_helper(s, errors, final=True, errorhandler=None, @@ -999,7 +970,7 @@ else: bo = 1 if size == 0: - return '', 0, 0, rutf8.FLAG_ASCII, bo + return '', 0, 0, bo if bo == -1: # force little endian ihi = 1 @@ -1058,8 +1029,8 @@ s, pos - 2, pos) result.append(r) r = result.build() - lgt, flag = rutf8.check_utf8(r, True) - return result.build(), pos, lgt, flag, bo + lgt = rutf8.check_utf8(r, True) + return result.build(), pos, lgt, bo def _STORECHAR(result, CH, byteorder): hi = chr(((CH) >> 8) & 0xff) @@ -1148,21 +1119,21 @@ def str_decode_utf_32(s, errors, final=True, errorhandler=None): - result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final, + result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final, errorhandler, "native") - return result, c, lgt, flag + return result, c, lgt def str_decode_utf_32_be(s, errors, final=True, errorhandler=None): - result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final, + result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final, errorhandler, "big") - return result, c, lgt, flag + return result, c, lgt def str_decode_utf_32_le(s, errors, final=True, errorhandler=None): - result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final, + result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final, errorhandler, "little") - return result, c, lgt, flag + return result, c, lgt BOM32_DIRECT = intmask(0x0000FEFF) BOM32_REVERSE = intmask(0xFFFE0000) @@ -1208,7 +1179,7 @@ else: bo = 1 if size == 0: - return '', 0, 0, rutf8.FLAG_ASCII, bo + return '', 0, 0, bo if bo == -1: # force little endian iorder = [0, 1, 2, 3] @@ -1243,8 +1214,8 @@ rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True) pos += 4 r = result.build() - lgt, flag = rutf8.check_utf8(r, True) - return r, pos, lgt, flag, bo + lgt = rutf8.check_utf8(r, True) + return r, pos, lgt, bo def _STORECHAR32(result, CH, byteorder): c0 = chr(((CH) >> 24) & 0xff) @@ -1330,7 +1301,7 @@ errorhandler=None): size = len(s) if size == 0: - return '', 0, 0, rutf8.FLAG_ASCII + return '', 0, 0 unicode_bytes = 4 if BYTEORDER == "little": @@ -1367,8 +1338,8 @@ rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True) pos += unicode_bytes r = result.build() - lgt, flag = rutf8.check_utf8(r, True) - return r, pos, lgt, flag + lgt = rutf8.check_utf8(r, True) + return r, pos, lgt def utf8_encode_unicode_internal(s, errors, errorhandler): size = len(s) @@ -1409,7 +1380,7 @@ errorhandler=errorhandler) size = len(s) if size == 0: - return '', 0, 0, rutf8.FLAG_ASCII + return '', 0, 0 pos = 0 result = StringBuilder(size) @@ -1426,8 +1397,8 @@ result.append(c) pos += 1 r = result.build() - lgt, flag = rutf8.check_utf8(r, True) - return r, pos, lgt, flag + lgt = rutf8.check_utf8(r, True) + return r, pos, lgt def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None): diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py --- a/pypy/module/__builtin__/operation.py +++ b/pypy/module/__builtin__/operation.py @@ -26,14 +26,8 @@ "Return a Unicode string of one character with the given ordinal." if code < 0 or code > 0x10FFFF: raise oefmt(space.w_ValueError, "unichr() arg out of range") - elif code < 0x80: - flag = rutf8.FLAG_ASCII - elif 0xD800 <= code <= 0xDFFF: - flag = rutf8.FLAG_HAS_SURROGATES - else: - flag = rutf8.FLAG_REGULAR s = rutf8.unichr_as_utf8(code, allow_surrogates=True) - return space.newutf8(s, 1, flag) + return space.newutf8(s, 1) def len(space, w_obj): "len(object) -> integer\n\nReturn the number of items of a sequence or mapping." diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py --- a/pypy/module/_cffi_backend/ctypeprim.py +++ b/pypy/module/_cffi_backend/ctypeprim.py @@ -183,8 +183,7 @@ raise oefmt(self.space.w_ValueError, "%s out of range for conversion to unicode: %s", self.name, s) - flag = rutf8.get_flag_from_code(intmask(value)) - return self.space.newutf8(utf8, 1, flag) + return self.space.newutf8(utf8, 1) def string(self, cdataobj, maxlen): with cdataobj as ptr: @@ -215,15 +214,15 @@ def unpack_ptr(self, w_ctypeptr, ptr, length): if self.size == 2: - utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length) + utf8, lgt = wchar_helper.utf8_from_char16(ptr, length) else: try: - utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length) + utf8, lgt = wchar_helper.utf8_from_char32(ptr, length) except wchar_helper.OutOfRange as e: raise oefmt(self.space.w_ValueError, "%s out of range for conversion to unicode: %s", self.name, hex(e.ordinal)) - return self.space.newutf8(utf8, lgt, flag) + return self.space.newutf8(utf8, lgt) class W_CTypePrimitiveSigned(W_CTypePrimitive): diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py --- a/pypy/module/_cffi_backend/wchar_helper.py +++ b/pypy/module/_cffi_backend/wchar_helper.py @@ -19,16 +19,14 @@ ptr = rffi.cast(rffi.UINTP, ptr) u = StringBuilder(length) j = 0 - flag = rutf8.FLAG_ASCII while j < length: ch = intmask(ptr[j]) j += 1 - flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch)) try: rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True) except ValueError: raise OutOfRange(ch) - return u.build(), length, flag + return u.build(), length def utf8_from_char16(ptr, length): # 'ptr' is a pointer to 'length' 16-bit integers @@ -36,7 +34,6 @@ u = StringBuilder(length) j = 0 result_length = length - flag = rutf8.FLAG_ASCII while j < length: ch = intmask(ptr[j]) j += 1 @@ -46,9 +43,8 @@ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000 j += 1 result_length -= 1 - flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch)) rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True) - return u.build(), result_length, flag + return u.build(), result_length @specialize.ll() diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -43,8 +43,8 @@ length = len(input) else: w_cls = space.w_UnicodeEncodeError - length, flag = rutf8.check_utf8(input, allow_surrogates=True) - w_input = space.newutf8(input, length, flag) + length = rutf8.check_utf8(input, allow_surrogates=True) + w_input = space.newutf8(input, length) w_exc = space.call_function( w_cls, space.newtext(encoding), @@ -192,7 +192,7 @@ def ignore_errors(space, w_exc): check_exception(space, w_exc) w_end = space.getattr(w_exc, space.newtext('end')) - return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), w_end]) + return space.newtuple([space.newutf8('', 0), w_end]) REPLACEMENT = u'\ufffd'.encode('utf8') @@ -203,13 +203,13 @@ size = space.int_w(w_end) - space.int_w(w_start) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): text = '?' * size - return space.newtuple([space.newutf8(text, size, rutf8.FLAG_ASCII), w_end]) + return space.newtuple([space.newutf8(text, size), w_end]) elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError): text = REPLACEMENT - return space.newtuple([space.newutf8(text, 1, rutf8.FLAG_REGULAR), w_end]) + return space.newtuple([space.newutf8(text, 1), w_end]) elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError): text = REPLACEMENT * size - return space.newtuple([space.newutf8(text, size, rutf8.FLAG_REGULAR), w_end]) + return space.newtuple([space.newutf8(text, size), w_end]) else: raise oefmt(space.w_TypeError, "don't know how to handle %T in error callback", w_exc) @@ -237,8 +237,8 @@ builder.append(";") pos = rutf8.next_codepoint_pos(obj, pos) r = builder.build() - lgt, flag = rutf8.check_utf8(r, True) - return space.newtuple([space.newutf8(r, lgt, flag), w_end]) + lgt = rutf8.check_utf8(r, True) + return space.newtuple([space.newutf8(r, lgt), w_end]) else: raise oefmt(space.w_TypeError, "don't know how to handle %T in error callback", w_exc) @@ -278,8 +278,8 @@ builder.append_slice(num, 2, lnum) pos = rutf8.next_codepoint_pos(obj, pos) r = builder.build() - lgt, flag = rutf8.check_utf8(r, True) - return space.newtuple([space.newutf8(r, lgt, flag), w_end]) + lgt = rutf8.check_utf8(r, True) + return space.newtuple([space.newutf8(r, lgt), w_end]) else: raise oefmt(space.w_TypeError, "don't know how to handle %T in error callback", w_exc) @@ -417,9 +417,9 @@ final = space.is_true(w_final) state = space.fromcache(CodecState) func = getattr(unicodehelper, rname) - result, consumed, length, flag = func(string, errors, + result, consumed, length = func(string, errors, final, state.decode_error_handler) - return space.newtuple([space.newutf8(result, length, flag), + return space.newtuple([space.newutf8(result, length), space.newint(consumed)]) wrap_decoder.func_name = rname globals()[name] = wrap_decoder @@ -488,14 +488,14 @@ state = space.fromcache(CodecState) # call the fast version for checking try: - lgt, flag = rutf8.check_utf8(string, allow_surrogates=True) + lgt = rutf8.check_utf8(string, allow_surrogates=True) except rutf8.CheckError: - res, consumed, lgt, flag = unicodehelper.str_decode_utf8(string, + res, consumed, lgt = unicodehelper.str_decode_utf8(string, errors, final, state.decode_error_handler) - return space.newtuple([space.newutf8(res, lgt, flag), + return space.newtuple([space.newutf8(res, lgt), space.newint(consumed)]) else: - return space.newtuple([space.newutf8(string, lgt, flag), + return space.newtuple([space.newutf8(string, lgt), space.newint(len(string))]) @unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int, @@ -516,10 +516,10 @@ consumed = len(data) if final: consumed = 0 - res, consumed, lgt, flag, byteorder = str_decode_utf_16_helper( + res, consumed, lgt, byteorder = str_decode_utf_16_helper( data, errors, final, state.decode_error_handler, byteorder) - return space.newtuple([space.newutf8(res, lgt, flag), + return space.newtuple([space.newutf8(res, lgt), space.newint(consumed), space.newint(byteorder)]) @@ -539,10 +539,10 @@ consumed = len(data) if final: consumed = 0 - res, consumed, lgt, flag, byteorder = str_decode_utf_32_helper( + res, consumed, lgt, byteorder = str_decode_utf_32_helper( data, errors, final, state.decode_error_handler, byteorder) - return space.newtuple([space.newutf8(res, lgt, flag), + return space.newtuple([space.newutf8(res, lgt), space.newint(consumed), space.newint(byteorder)]) @@ -632,7 +632,7 @@ if errors is None: errors = 'strict' if len(string) == 0: - return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), + return space.newtuple([space.newutf8('', 0), space.newint(0)]) if space.is_none(w_mapping): @@ -642,9 +642,9 @@ final = True state = space.fromcache(CodecState) - result, consumed, lgt, flag = unicodehelper.str_decode_charmap( + result, consumed, lgt = unicodehelper.str_decode_charmap( string, errors, final, state.decode_error_handler, mapping) - return space.newtuple([space.newutf8(result, lgt, flag), + return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)]) @unwrap_spec(errors='text_or_none') @@ -708,12 +708,12 @@ unicode_name_handler = state.get_unicodedata_handler(space) - result, consumed, lgt, flag = unicodehelper.str_decode_unicode_escape( + result, consumed, lgt = unicodehelper.str_decode_unicode_escape( string, errors, final, state.decode_error_handler, unicode_name_handler) - return space.newtuple([space.newutf8(result, lgt, flag), space.newint(consumed)]) + return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)]) # ____________________________________________________________ # Unicode-internal @@ -731,15 +731,15 @@ string = space.readbuf_w(w_string).as_str() if len(string) == 0: - return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), + return space.newtuple([space.newutf8('', 0), space.newint(0)]) final = True state = space.fromcache(CodecState) - result, consumed, lgt, flag = unicodehelper.str_decode_unicode_internal( + result, consumed, lgt = unicodehelper.str_decode_unicode_internal( string, errors, final, state.decode_error_handler) - return space.newtuple([space.newutf8(result, lgt, flag), + return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)]) # ____________________________________________________________ diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -11,8 +11,8 @@ from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong from rpython.rlib.rbigint import rbigint from rpython.rlib.rstring import StringBuilder -from rpython.rlib.rutf8 import ( - FLAG_ASCII, check_utf8, next_codepoint_pos, codepoints_in_utf8) +from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos, + codepoints_in_utf8) STATE_ZERO, STATE_OK, STATE_DETACHED = range(3) @@ -31,22 +31,22 @@ def __init__(self, space): self.w_newlines_dict = { - SEEN_CR: space.newutf8("\r", 1, FLAG_ASCII), - SEEN_LF: space.newutf8("\n", 1, FLAG_ASCII), - SEEN_CRLF: space.newutf8("\r\n", 2, FLAG_ASCII), + SEEN_CR: space.newutf8("\r", 1), + SEEN_LF: space.newutf8("\n", 1), + SEEN_CRLF: space.newutf8("\r\n", 2), SEEN_CR | SEEN_LF: space.newtuple( - [space.newutf8("\r", 1, FLAG_ASCII), - space.newutf8("\n", 1, FLAG_ASCII)]), + [space.newutf8("\r", 1), + space.newutf8("\n", 1)]), SEEN_CR | SEEN_CRLF: space.newtuple( - [space.newutf8("\r", 1, FLAG_ASCII), - space.newutf8("\r\n", 2, FLAG_ASCII)]), + [space.newutf8("\r", 1), + space.newutf8("\r\n", 2)]), SEEN_LF | SEEN_CRLF: space.newtuple( - [space.newutf8("\n", 1, FLAG_ASCII), - space.newutf8("\r\n", 2, FLAG_ASCII)]), + [space.newutf8("\n", 1), + space.newutf8("\r\n", 2)]), SEEN_CR | SEEN_LF | SEEN_CRLF: space.newtuple( - [space.newutf8("\r", 1, FLAG_ASCII), - space.newutf8("\n", 1, FLAG_ASCII), - space.newutf8("\r\n", 2, FLAG_ASCII)]), + [space.newutf8("\r", 1), + space.newutf8("\n", 1), + space.newutf8("\r\n", 2)]), } @unwrap_spec(translate=int) @@ -98,7 +98,7 @@ output_len -= 1 if output_len == 0: - return space.newutf8("", 0, FLAG_ASCII) + return space.newutf8("", 0) # Record which newlines are read and do newline translation if # desired, all in one pass. @@ -153,8 +153,8 @@ output = builder.build() self.seennl |= seennl - lgt, flag = check_utf8(output, True) - return space.newutf8(output, lgt, flag) + lgt = check_utf8(output, True) + return space.newutf8(output, lgt) def reset_w(self, space): self.seennl = 0 @@ -361,6 +361,7 @@ while scanned < limit: try: ch = self.next_char() + scanned += 1 except StopIteration: return False if ch == '\n': @@ -746,7 +747,7 @@ remnant = None continue - if limit > 0: + if limit >= 0: remaining = limit - builder.getlength() assert remaining >= 0 else: diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py --- a/pypy/module/_io/test/test_interp_textio.py +++ b/pypy/module/_io/test/test_interp_textio.py @@ -1,41 +1,54 @@ import pytest try: - from hypothesis import given, strategies as st, assume + from hypothesis import given, strategies as st except ImportError: pytest.skip("hypothesis required") +import os from pypy.module._io.interp_bytesio import W_BytesIO from pypy.module._io.interp_textio import W_TextIOWrapper, DecodeBuffer -LINESEP = ['', '\r', '\n', '\r\n'] +def translate_newlines(text): + text = text.replace(u'\r\n', u'\n') + text = text.replace(u'\r', u'\n') + return text.replace(u'\n', os.linesep) @st.composite -def text_with_newlines(draw): - sep = draw(st.sampled_from(LINESEP)) - lines = draw(st.lists(st.text(max_size=10), max_size=10)) - return sep.join(lines) +def st_readline(draw, st_nlines=st.integers(min_value=0, max_value=10)): + n_lines = draw(st_nlines) + fragments = [] + limits = [] + for _ in range(n_lines): + line = draw(st.text(st.characters(blacklist_characters=u'\r\n'))) + fragments.append(line) + ending = draw(st.sampled_from([u'\n', u'\r', u'\r\n'])) + fragments.append(ending) + limit = draw(st.integers(min_value=0, max_value=len(line) + 5)) + limits.append(limit) + limits.append(-1) + return (u''.join(fragments), limits) -@given(txt=text_with_newlines(), - mode=st.sampled_from(['\r', '\n', '\r\n', '']), - limit=st.integers(min_value=-1)) -def test_readline(space, txt, mode, limit): - assume(limit != 0) +@given(data=st_readline(), + mode=st.sampled_from(['\r', '\n', '\r\n', ''])) +def test_readline(space, data, mode): + txt, limits = data w_stream = W_BytesIO(space) w_stream.descr_init(space, space.newbytes(txt.encode('utf-8'))) w_textio = W_TextIOWrapper(space) w_textio.descr_init( - space, w_stream, encoding='utf-8', + space, w_stream, + encoding='utf-8', w_errors=space.newtext('surrogatepass'), w_newline=space.newtext(mode)) lines = [] - while True: + for limit in limits: w_line = w_textio.readline_w(space, space.newint(limit)) line = space.utf8_w(w_line).decode('utf-8') - if limit > 0: + if limit >= 0: assert len(line) <= limit if line: lines.append(line) - else: + elif limit: break - assert u''.join(lines) == txt + assert txt.startswith(u''.join(lines)) @given(st.text()) def test_read_buffer(text): diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -66,8 +66,8 @@ pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf) assert 0 <= pos <= len(object) self.pending = object[pos:] - lgt, flag = rutf8.get_utf8_length_flag(output) - return space.newutf8(output, lgt, flag) + lgt = rutf8.get_utf8_length_flag(output) + return space.newutf8(output, lgt) @unwrap_spec(errors="text_or_none") diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -78,12 +78,11 @@ space.newtext(e.reason)])) def wrap_unicodeencodeerror(space, e, input, inputlen, name): - _, flag = rutf8.check_utf8(input, True) raise OperationError( space.w_UnicodeEncodeError, space.newtuple([ space.newtext(name), - space.newutf8(input, inputlen, flag), + space.newutf8(input, inputlen), space.newint(e.start), space.newint(e.end), space.newtext(e.reason)])) diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py --- a/pypy/module/_pypyjson/interp_decoder.py +++ b/pypy/module/_pypyjson/interp_decoder.py @@ -295,15 +295,15 @@ if bits & 0x80: # the 8th bit is set, it's an utf8 string content_utf8 = self.getslice(start, end) - lgt, flag = unicodehelper.check_utf8_or_raise(self.space, + lgt = unicodehelper.check_utf8_or_raise(self.space, content_utf8) - return self.space.newutf8(content_utf8, lgt, flag) + return self.space.newutf8(content_utf8, lgt) else: # ascii only, fast path (ascii is a strict subset of # latin1, and we already checked that all the chars are < # 128) return self.space.newutf8(self.getslice(start, end), - end - start, rutf8.FLAG_ASCII) + end - start) def decode_string_escaped(self, start): i = self.pos @@ -316,10 +316,10 @@ i += 1 if ch == '"': content_utf8 = builder.build() - lgt, f = unicodehelper.check_utf8_or_raise(self.space, + lgt = unicodehelper.check_utf8_or_raise(self.space, content_utf8) self.pos = i - return self.space.newutf8(content_utf8, lgt, f) + return self.space.newutf8(content_utf8, lgt) elif ch == '\\': i = self.decode_escape_sequence(i, builder) elif ch < '\x20': diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py --- a/pypy/module/_pypyjson/test/test__pypyjson.py +++ b/pypy/module/_pypyjson/test/test__pypyjson.py @@ -11,7 +11,7 @@ dec.close() class FakeSpace(object): - def newutf8(self, s, l, f): + def newutf8(self, s, l): return s def test_decode_key(): diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -228,8 +228,7 @@ return space.newbytes(chr(ucharval)) elif w_ffitype.is_unichar(): wcharval = self.get_unichar(w_ffitype) - return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1, - rutf8.get_flag_from_code(intmask(wcharval))) + return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1) elif w_ffitype.is_double(): return self._float(w_ffitype) elif w_ffitype.is_singlefloat(): diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -596,9 +596,9 @@ return space.w_None wcharp_addr = rffi.cast(rffi.CWCHARP, address) if maxlength == -1: - s = rffi.wcharp2unicode(wcharp_addr) + s = rffi.wcharp2utf8(wcharp_addr) else: - s = rffi.wcharp2unicoden(wcharp_addr, maxlength) + s = rffi.wcharpsize2utf8(wcharp_addr, maxlength) return space.newunicode(s) @unwrap_spec(address=r_uint, maxlength=int) diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -41,7 +41,9 @@ if isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string[start:end]) elif isinstance(ctx, rsre_core.UnicodeMatchContext): - return space.newunicode(ctx._unicodestr[start:end]) + s = ctx._unicodestr[start:end] + lgt = rutf8.check_utf8(s, True) + return space.newutf8(s, lgt) else: # unreachable raise SystemError @@ -340,11 +342,10 @@ else: assert unicodebuilder is not None return space.newutf8(unicodebuilder.build(), - unicodebuilder.get_length(), - unicodebuilder.get_flag()), n + unicodebuilder.get_length()), n else: if space.isinstance_w(w_string, space.w_unicode): - w_emptystr = space.newunicode(u'') + w_emptystr = space.newutf8('', 0) else: w_emptystr = space.newbytes('') w_item = space.call_method(w_emptystr, 'join', @@ -578,7 +579,8 @@ elif isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string) elif isinstance(ctx, rsre_core.UnicodeMatchContext): - return space.newunicode(ctx._unicodestr) + lgt = rutf8.check_utf8(ctx._unicodestr, True) + return space.newutf8(ctx._unicodestr, lgt) else: raise SystemError diff --git a/pypy/module/_warnings/interp_warnings.py b/pypy/module/_warnings/interp_warnings.py --- a/pypy/module/_warnings/interp_warnings.py +++ b/pypy/module/_warnings/interp_warnings.py @@ -1,3 +1,6 @@ + +from rpython.rlib import rutf8 + from pypy.interpreter.gateway import unwrap_spec, WrappedDefault from pypy.interpreter.error import OperationError, oefmt @@ -208,10 +211,11 @@ except OperationError as e: if e.async(space): raise - message = u"%s:%d: %s: %s\n" % (space.unicode_w(w_filename), lineno, - space.unicode_w(w_name), - space.unicode_w(w_text)) - w_message = space.newunicode(message) + message = "%s:%d: %s: %s\n" % (space.utf8_w(w_filename), lineno, + space.utf8_w(w_name), + space.utf8_w(w_text)) + lgt = rutf8.check_utf8(message, True) + w_message = space.newutf8(message, lgt) else: w_message = space.newtext(message) space.call_method(w_stderr, "write", w_message) diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py --- a/pypy/module/array/interp_array.py +++ b/pypy/module/array/interp_array.py @@ -1,4 +1,4 @@ -from rpython.rlib import jit, rgc +from rpython.rlib import jit, rgc, rutf8 from rpython.rlib.buffer import RawBuffer from rpython.rlib.objectmodel import keepalive_until_here from rpython.rlib.rarithmetic import ovfcheck, widen @@ -451,7 +451,7 @@ """ if self.typecode == 'u': buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned()) - return space.newunicode(rffi.wcharpsize2unicode(buf, self.len)) + return space.newutf8(rffi.wcharpsize2unicode(buf, self.len)) else: raise oefmt(space.w_ValueError, "tounicode() may only be called on type 'u' arrays") @@ -797,7 +797,7 @@ TypeCode(rffi.UINT, 'int_w', True) types = { 'c': TypeCode(lltype.Char, 'bytes_w', method=''), - 'u': TypeCode(lltype.UniChar, 'unicode_w', method=''), + 'u': TypeCode(lltype.UniChar, 'utf8_len_w', method=''), 'b': TypeCode(rffi.SIGNEDCHAR, 'int_w', True, True), 'B': TypeCode(rffi.UCHAR, 'int_w', True), 'h': TypeCode(rffi.SHORT, 'int_w', True, True), @@ -895,11 +895,17 @@ "unsigned %d-byte integer out of range", mytype.bytes) return rffi.cast(mytype.itemtype, item) - if mytype.unwrap == 'bytes_w' or mytype.unwrap == 'unicode_w': + if mytype.unwrap == 'bytes_w': if len(item) != 1: raise oefmt(space.w_TypeError, "array item must be char") item = item[0] return rffi.cast(mytype.itemtype, item) + if mytype.unwrap == 'utf8_len_w': + utf8, lgt = item + if lgt != 1: + raise oefmt(space.w_TypeError, "array item must be char") + uchar = rutf8.codepoint_at_pos(utf8, 0) + return rffi.cast(mytype.itemtype, uchar) # # "regular" case: it fits in an rpython integer (lltype.Signed) # or it is a float @@ -1007,7 +1013,8 @@ elif mytype.typecode == 'c': return space.newbytes(item) elif mytype.typecode == 'u': - return space.newunicode(item) + code = ord(item) + return space.newutf8(rutf8.unichr_as_utf8(code), 1) assert 0, "unreachable" # interface diff --git a/pypy/module/cpyext/test/test_codecs.py b/pypy/module/cpyext/test/test_codecs.py --- a/pypy/module/cpyext/test/test_codecs.py +++ b/pypy/module/cpyext/test/test_codecs.py @@ -11,5 +11,5 @@ w_encoded = space.call_method(w_encoder, 'encode', space.wrap(u'späm')) w_decoder = PyCodec_IncrementalDecoder(space, utf8, None) w_decoded = space.call_method(w_decoder, 'decode', w_encoded) - assert space.unwrap(w_decoded) == u'späm' + assert space.unicode_w(w_decoded) == u'späm' rffi.free_charp(utf8) diff --git a/pypy/module/cpyext/test/test_eval.py b/pypy/module/cpyext/test/test_eval.py --- a/pypy/module/cpyext/test/test_eval.py +++ b/pypy/module/cpyext/test/test_eval.py @@ -131,7 +131,7 @@ finally: rffi.free_charp(buf) w_a = space.getitem(w_globals, space.wrap("a")) - assert space.unwrap(w_a) == u'caf\xe9' + assert space.unicode_w(w_a) == u'caf\xe9' lltype.free(flags, flavor='raw') def test_run_file(self, space): diff --git a/pypy/module/cpyext/test/test_object.py b/pypy/module/cpyext/test/test_object.py --- a/pypy/module/cpyext/test/test_object.py +++ b/pypy/module/cpyext/test/test_object.py @@ -8,7 +8,7 @@ from pypy.module.cpyext.object import ( PyObject_IsTrue, PyObject_Not, PyObject_GetAttrString, PyObject_DelAttrString, PyObject_GetAttr, PyObject_DelAttr, - PyObject_GetItem, + PyObject_GetItem, PyObject_IsInstance, PyObject_IsSubclass, PyObject_AsFileDescriptor, PyObject_Hash, PyObject_Cmp, PyObject_Unicode ) @@ -209,9 +209,9 @@ PyObject_Cmp(space, w(u"\xe9"), w("\xe9"), ptr) def test_unicode(self, space, api): - assert space.unwrap(api.PyObject_Unicode(None)) == u"<NULL>" - assert space.unwrap(api.PyObject_Unicode(space.wrap([]))) == u"[]" - assert space.unwrap(api.PyObject_Unicode(space.wrap("e"))) == u"e" + assert space.unicode_w(api.PyObject_Unicode(None)) == u"<NULL>" + assert space.unicode_w(api.PyObject_Unicode(space.wrap([]))) == u"[]" + assert space.unicode_w(api.PyObject_Unicode(space.wrap("e"))) == u"e" with raises_w(space, UnicodeDecodeError): PyObject_Unicode(space, space.wrap("\xe9")) @@ -562,7 +562,7 @@ PyObject *a = PyTuple_GetItem(args, 0); PyObject *b = PyTuple_GetItem(args, 1); int res = PyObject_RichCompareBool(a, b, Py_EQ); - return PyLong_FromLong(res); + return PyLong_FromLong(res); """),]) a = float('nan') b = float('nan') diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py --- a/pypy/module/cpyext/test/test_unicodeobject.py +++ b/pypy/module/cpyext/test/test_unicodeobject.py @@ -178,7 +178,7 @@ array = rffi.cast(rffi.CWCHARP, PyUnicode_AS_DATA(space, word)) array2 = PyUnicode_AS_UNICODE(space, word) array3 = PyUnicode_AsUnicode(space, word) - for (i, char) in enumerate(space.unwrap(word)): + for (i, char) in enumerate(space.unicode_w(word)): assert array[i] == char assert array2[i] == char assert array3[i] == char @@ -216,12 +216,12 @@ def test_fromstring(self, space): s = rffi.str2charp(u'sp\x09m'.encode("utf-8")) w_res = PyUnicode_FromString(space, s) - assert space.unwrap(w_res) == u'sp\x09m' + assert space.unicode_w(w_res) == u'sp\x09m' res = PyUnicode_FromStringAndSize(space, s, 4) w_res = from_ref(space, res) Py_DecRef(space, res) - assert space.unwrap(w_res) == u'sp\x09m' + assert space.unicode_w(w_res) == u'sp\x09m' rffi.free_charp(s) def test_unicode_resize(self, space): @@ -256,17 +256,17 @@ u = rffi.str2charp(u'sp\x134m'.encode("utf-8")) w_u = PyUnicode_DecodeUTF8(space, u, 5, None) assert space.type(w_u) is space.w_unicode - assert space.unwrap(w_u) == u'sp\x134m' + assert space.unicode_w(w_u) == u'sp\x134m' w_u = PyUnicode_DecodeUTF8(space, u, 2, None) assert space.type(w_u) is space.w_unicode - assert space.unwrap(w_u) == 'sp' + assert space.unicode_w(w_u) == 'sp' rffi.free_charp(u) def test_encode_utf8(self, space): u = rffi.unicode2wcharp(u'sp\x09m') w_s = PyUnicode_EncodeUTF8(space, u, 4, None) - assert space.unwrap(w_s) == u'sp\x09m'.encode('utf-8') + assert space.unicode_w(w_s) == u'sp\x09m'.encode('utf-8') rffi.free_wcharp(u) def test_encode_decimal(self, space): @@ -364,18 +364,18 @@ def test_fromobject(self, space): w_u = space.wrap(u'a') assert PyUnicode_FromObject(space, w_u) is w_u - assert space.unwrap( + assert space.unicode_w( PyUnicode_FromObject(space, space.wrap('test'))) == 'test' def test_decode(self, space): b_text = rffi.str2charp('caf\x82xx') b_encoding = rffi.str2charp('cp437') - assert space.unwrap( + assert space.unicode_w( PyUnicode_Decode(space, b_text, 4, b_encoding, None)) == u'caf\xe9' w_text = PyUnicode_FromEncodedObject(space, space.wrap("test"), b_encoding, None) assert space.isinstance_w(w_text, space.w_unicode) - assert space.unwrap(w_text) == "test" + assert space.unicode_w(w_text) == "test" with raises_w(space, TypeError): PyUnicode_FromEncodedObject(space, space.wrap(u"test"), @@ -391,7 +391,8 @@ u_text = u'abcdefg' s_text = space.str_w(PyUnicode_AsEncodedString(space, space.wrap(u_text), null_charp, null_charp)) b_text = rffi.str2charp(s_text) - assert space.unwrap(PyUnicode_Decode(space, b_text, len(s_text), null_charp, null_charp)) == u_text + assert space.unicode_w(PyUnicode_Decode( + space, b_text, len(s_text), null_charp, null_charp)) == u_text with raises_w(space, TypeError): PyUnicode_FromEncodedObject( space, space.wrap(u_text), null_charp, None) @@ -508,7 +509,7 @@ def test_concat(self, space): w_res = PyUnicode_Concat(space, space.wrap(u'a'), space.wrap(u'b')) - assert space.unwrap(w_res) == u'ab' + assert space.unicode_w(w_res) == u'ab' def test_copy(self, space): w_x = space.wrap(u"abcd\u0660") @@ -579,29 +580,30 @@ w_format = space.wrap(u'hi %s') w_args = space.wrap((u'test',)) w_formated = PyUnicode_Format(space, w_format, w_args) - assert space.unwrap(w_formated) == space.unwrap(space.mod(w_format, w_args)) + assert (space.unicode_w(w_formated) == + space.unicode_w(space.mod(w_format, w_args))) def test_join(self, space): w_sep = space.wrap(u'<sep>') w_seq = space.wrap([u'a', u'b']) w_joined = PyUnicode_Join(space, w_sep, w_seq) - assert space.unwrap(w_joined) == u'a<sep>b' + assert space.unicode_w(w_joined) == u'a<sep>b' def test_fromordinal(self, space): w_char = PyUnicode_FromOrdinal(space, 65) - assert space.unwrap(w_char) == u'A' + assert space.unicode_w(w_char) == u'A' w_char = PyUnicode_FromOrdinal(space, 0) - assert space.unwrap(w_char) == u'\0' + assert space.unicode_w(w_char) == u'\0' w_char = PyUnicode_FromOrdinal(space, 0xFFFF) - assert space.unwrap(w_char) == u'\uFFFF' + assert space.unicode_w(w_char) == u'\uFFFF' def test_replace(self, space): w_str = space.wrap(u"abababab") w_substr = space.wrap(u"a") w_replstr = space.wrap(u"z") - assert u"zbzbabab" == space.unwrap( + assert u"zbzbabab" == space.unicode_w( PyUnicode_Replace(space, w_str, w_substr, w_replstr, 2)) - assert u"zbzbzbzb" == space.unwrap( + assert u"zbzbzbzb" == space.unicode_w( PyUnicode_Replace(space, w_str, w_substr, w_replstr, -1)) def test_tailmatch(self, space): diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py --- a/pypy/module/exceptions/interp_exceptions.py +++ b/pypy/module/exceptions/interp_exceptions.py @@ -126,7 +126,7 @@ return space.call_function(space.w_unicode, w_as_str) lgt = len(self.args_w) if lgt == 0: - return space.newunicode(u"") + return space.newutf8("", 0) if lgt == 1: return space.call_function(space.w_unicode, self.args_w[0]) else: @@ -719,7 +719,7 @@ def descr_init(self, space, w_encoding, w_object, w_start, w_end, w_reason): # typechecking space.realtext_w(w_encoding) - space.utf8_w(w_object) + space.realutf8_w(w_object) space.int_w(w_start) space.int_w(w_end) space.realtext_w(w_reason) diff --git a/pypy/module/operator/tscmp.py b/pypy/module/operator/tscmp.py --- a/pypy/module/operator/tscmp.py +++ b/pypy/module/operator/tscmp.py @@ -45,15 +45,15 @@ Note: If a and b are of different lengths, or if an error occurs, a timing attack could theoretically reveal information about the types and lengths of a and b--but not their values. + + XXX note that here the strings have to have the same length as UTF8, + not only as unicode. Not sure how to do better """ if (space.isinstance_w(w_a, space.w_unicode) and space.isinstance_w(w_b, space.w_unicode)): - a = space.unicode_w(w_a) - b = space.unicode_w(w_b) - with rffi.scoped_nonmoving_unicodebuffer(a) as a_buf: - with rffi.scoped_nonmoving_unicodebuffer(b) as b_buf: - result = pypy_tscmp_wide(a_buf, b_buf, len(a), len(b)) - return space.newbool(rffi.cast(lltype.Bool, result)) + a = space.utf8_w(w_a) + b = space.utf8_w(w_b) + return space.newbool(_compare_two_strings(a, b)) return compare_digest_buffer(space, w_a, w_b) @@ -68,7 +68,10 @@ a = a_buf.as_str() b = b_buf.as_str() + return space.newbool(_compare_two_strings(a, b)) + +def _compare_two_strings(a, b): with rffi.scoped_nonmovingbuffer(a) as a_buf: with rffi.scoped_nonmovingbuffer(b) as b_buf: result = pypy_tscmp(a_buf, b_buf, len(a), len(b)) - return space.newbool(rffi.cast(lltype.Bool, result)) + return rffi.cast(lltype.Bool, result) diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py --- a/pypy/module/pyexpat/interp_pyexpat.py +++ b/pypy/module/pyexpat/interp_pyexpat.py @@ -478,8 +478,8 @@ # I suppose this is a valid utf8, but there is noone to check # and noone to catch an error either try: - lgt, flag = rutf8.check_utf8(s, True) - return space.newutf8(s, lgt, flag) + lgt = rutf8.check_utf8(s, True) + return space.newutf8(s, lgt) except rutf8.CheckError: from pypy.interpreter import unicodehelper # get the correct error msg diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py --- a/pypy/module/unicodedata/test/test_hyp.py +++ b/pypy/module/unicodedata/test/test_hyp.py @@ -10,7 +10,7 @@ def normalize(s): w_s = space.newunicode(s) w_res = ucd.normalize(space, NF_code, w_s) - return space.unwrap(w_res) + return space.unicode_w(w_res) return normalize all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD'] diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py --- a/pypy/objspace/fake/objspace.py +++ b/pypy/objspace/fake/objspace.py @@ -209,7 +209,7 @@ def newbytes(self, x): return w_some_obj() - def newutf8(self, x, l, f): + def newutf8(self, x, l): return w_some_obj() def new_from_utf8(self, a): diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py --- a/pypy/objspace/std/bytearrayobject.py +++ b/pypy/objspace/std/bytearrayobject.py @@ -195,11 +195,11 @@ w_dict = self.getdict(space) if w_dict is None: w_dict = space.w_None - s, _, lgt, flag = str_decode_latin_1(''.join(self.getdata()), 'strict', + s, _, lgt = str_decode_latin_1(''.join(self.getdata()), 'strict', True, None) return space.newtuple([ space.type(self), space.newtuple([ - space.newutf8(s, lgt, flag), space.newtext('latin-1')]), + space.newutf8(s, lgt), space.newtext('latin-1')]), w_dict]) @staticmethod diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py --- a/pypy/objspace/std/dictmultiobject.py +++ b/pypy/objspace/std/dictmultiobject.py @@ -1197,7 +1197,7 @@ unerase = staticmethod(unerase) def wrap(self, unwrapped): - return self.space.newutf8(unwrapped, len(unwrapped), rutf8.FLAG_ASCII) + return self.space.newutf8(unwrapped, len(unwrapped)) def unwrap(self, wrapped): return self.space.utf8_w(wrapped) @@ -1239,7 +1239,7 @@ ## return self.space.newlist_bytes(self.listview_bytes(w_dict)) def wrapkey(space, key): - return space.newutf8(key, len(key), rutf8.FLAG_ASCII) + return space.newutf8(key, len(key)) ## @jit.look_inside_iff(lambda self, w_dict: ## w_dict_unrolling_heuristic(w_dict)) diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py --- a/pypy/objspace/std/formatting.py +++ b/pypy/objspace/std/formatting.py @@ -198,8 +198,8 @@ if self.w_valuedict is None: raise oefmt(space.w_TypeError, "format requires a mapping") if do_unicode: - lgt, flag = rutf8.check_utf8(key, True) - w_key = space.newutf8(key, lgt, flag) + lgt = rutf8.check_utf8(key, True) + w_key = space.newutf8(key, lgt) else: w_key = space.newbytes(key) return space.getitem(self.w_valuedict, w_key) @@ -330,8 +330,7 @@ space = self.space if do_unicode: cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1) - flag = rutf8.get_flag_from_code(cp) - w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag) + w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1) else: cp = ord(self.fmt[self.fmtpos - 1]) w_s = space.newbytes(chr(cp)) @@ -513,8 +512,8 @@ formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict) result = formatter.format() # this can force strings, not sure if it's a problem or not - lgt, flag = rutf8.check_utf8(result, True) - return space.newutf8(result, lgt, flag) + lgt = rutf8.check_utf8(result, True) + return space.newutf8(result, lgt) def mod_format(space, w_format, w_values, do_unicode=False): if space.isinstance_w(w_values, space.w_tuple): diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py --- a/pypy/objspace/std/listobject.py +++ b/pypy/objspace/std/listobject.py @@ -1998,7 +1998,7 @@ def wrap(self, stringval): assert stringval is not None - return self.space.newutf8(stringval, len(stringval), rutf8.FLAG_ASCII) + return self.space.newutf8(stringval, len(stringval)) def unwrap(self, w_string): return self.space.utf8_w(w_string) diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py --- a/pypy/objspace/std/marshal_impl.py +++ b/pypy/objspace/std/marshal_impl.py @@ -403,8 +403,8 @@ @unmarshaller(TYPE_UNICODE) def unmarshal_unicode(space, u, tc): arg = u.get_str() - length, flag = unicodehelper.check_utf8_or_raise(space, arg) - return space.newutf8(arg, length, flag) + length = unicodehelper.check_utf8_or_raise(space, arg) + return space.newutf8(arg, length) @marshaller(W_SetObject) def marshal_set(space, w_set, m): diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py --- a/pypy/objspace/std/newformat.py +++ b/pypy/objspace/std/newformat.py @@ -51,8 +51,8 @@ if for_unicode: def wrap(self, u): - lgt, flag = rutf8.check_utf8(u, True) - return self.space.newutf8(u, lgt, flag) + lgt = rutf8.check_utf8(u, True) + return self.space.newutf8(u, lgt) else: def wrap(self, s): return self.space.newbytes(s) @@ -379,8 +379,8 @@ template = unicode_template_formatter(space, space.utf8_w(w_string)) r = template.build(args) - lgt, flag = rutf8.check_utf8(r, True) - return space.newutf8(r, lgt, flag) + lgt = rutf8.check_utf8(r, True) + return space.newutf8(r, lgt) else: template = str_template_formatter(space, space.bytes_w(w_string)) return space.newbytes(template.build(args)) @@ -416,8 +416,8 @@ if for_unicode: def wrap(self, u): - lgt, flag = rutf8.check_utf8(u, True) - return self.space.newutf8(u, lgt, flag) + lgt = rutf8.check_utf8(u, True) + return self.space.newutf8(u, lgt) else: def wrap(self, s): return self.space.newbytes(s) diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -165,8 +165,8 @@ return self.newtext(x) if isinstance(x, unicode): x = x.encode('utf8') - lgt, flag = rutf8.check_utf8(x, True) - return self.newutf8(x, lgt, flag) + lgt = rutf8.check_utf8(x, True) + return self.newutf8(x, lgt) if isinstance(x, float): return W_FloatObject(x) if isinstance(x, W_Root): @@ -362,16 +362,10 @@ return self.w_None return self.newtext(s) - def newutf8(self, utf8s, length, flag): + def newutf8(self, utf8s, length): assert utf8s is not None assert isinstance(utf8s, str) - return W_UnicodeObject(utf8s, length, flag) - - def new_from_utf8(self, utf8s): - # XXX: kill me! - assert isinstance(utf8s, str) - length, flag = rutf8.check_utf8(utf8s, True) - return W_UnicodeObject(utf8s, length, flag) + return W_UnicodeObject(utf8s, length) def newfilename(self, s): assert isinstance(s, str) # on pypy3, this decodes the byte string diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py --- a/pypy/objspace/std/setobject.py +++ b/pypy/objspace/std/setobject.py @@ -1291,7 +1291,7 @@ return self.space.utf8_w(w_item) def wrap(self, item): - return self.space.newutf8(item, len(item), rutf8.FLAG_ASCII) + return self.space.newutf8(item, len(item)) def iter(self, w_set): return UnicodeIteratorImplementation(self.space, self, w_set) @@ -1495,7 +1495,7 @@ def next_entry(self): for key in self.iterator: - return self.space.newutf8(key, len(key), rutf8.FLAG_ASCII) + return self.space.newutf8(key, len(key)) else: return None diff --git a/pypy/objspace/std/test/test_index.py b/pypy/objspace/std/test/test_index.py --- a/pypy/objspace/std/test/test_index.py +++ b/pypy/objspace/std/test/test_index.py @@ -265,8 +265,7 @@ class AppTest_UnicodeTestCase(SeqTestCase, StringTestCase): def setup_method(self, method): SeqTestCase.setup_method(self, method) - self.w_seq = self.space.newutf8("this is a test", len("this is a test"), - rutf8.FLAG_ASCII) + self.w_seq = self.space.newutf8("this is a test", len("this is a test")) self.w_const = self.space.appexec([], """(): return unicode""") diff --git a/pypy/objspace/std/test/test_lengthhint.py b/pypy/objspace/std/test/test_lengthhint.py --- a/pypy/objspace/std/test/test_lengthhint.py +++ b/pypy/objspace/std/test/test_lengthhint.py @@ -74,8 +74,7 @@ self._test_length_hint(self.space.wrap('P' * self.SIZE)) def test_unicode(self): - self._test_length_hint(self.space.newutf8('Y' * self.SIZE, self.SIZE, - rutf8.FLAG_ASCII)) + self._test_length_hint(self.space.newutf8('Y' * self.SIZE, self.SIZE)) def test_tuple(self): self._test_length_hint(self.space.wrap(tuple(self.ITEMS))) diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py --- a/pypy/objspace/std/test/test_liststrategies.py +++ b/pypy/objspace/std/test/test_liststrategies.py @@ -600,9 +600,9 @@ def test_unicode(self): l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")]) assert isinstance(l1.strategy, BytesListStrategy) - l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4, 2), self.space.newutf8("zwei", 4, 2)]) + l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4), self.space.newutf8("zwei", 4)]) assert isinstance(l2.strategy, UnicodeListStrategy) - l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4, 2)]) + l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4)]) assert isinstance(l3.strategy, ObjectListStrategy) def test_listview_bytes(self): diff --git a/pypy/objspace/std/test/test_obj.py b/pypy/objspace/std/test/test_obj.py --- a/pypy/objspace/std/test/test_obj.py +++ b/pypy/objspace/std/test/test_obj.py @@ -17,7 +17,7 @@ cls.w_cpython_apptest = space.wrap(option.runappdirect and not hasattr(sys, 'pypy_translation_info')) def w_unwrap_wrap_unicode(space, w_obj): - return space.newutf8(space.utf8_w(w_obj), w_obj._length, w_obj._get_flag()) + return space.newutf8(space.utf8_w(w_obj), w_obj._length) cls.w_unwrap_wrap_unicode = space.wrap(gateway.interp2app(w_unwrap_wrap_unicode)) def w_unwrap_wrap_str(space, w_obj): return space.wrap(space.str_w(w_obj)) diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -27,12 +27,12 @@ assert len(warnings) == 2 def test_listview_unicode(self): - w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII) + w_str = self.space.newutf8('abcd', 4) assert self.space.listview_utf8(w_str) == list("abcd") _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit