Author: Matti Picus <matti.pi...@gmail.com> Branch: py3.5 Changeset: r96005:0bbb64dc7f98 Date: 2019-02-13 23:13 +0200 http://bitbucket.org/pypy/pypy/changeset/0bbb64dc7f98/
Log: merge unicode-utf8-py3 into py3.5 diff too long, truncating to 2000 out of 25065 lines diff --git a/.hgtags b/.hgtags --- a/.hgtags +++ b/.hgtags @@ -61,3 +61,9 @@ 9112c8071614108b1042bfef0713915107004d62 release-pypy2.7-v7.0.0 1f86f25937b6ae6c8b25236c35228fac587678bf release-pypy3.5-v7.0.0 dab365a465140aa79a5f3ba4db784c4af4d5c195 release-pypy3.6-v7.0.0 +9112c8071614108b1042bfef0713915107004d62 release-pypy2.7-v7.0.0 +c8805ee6d7846ca2722b106eeaa2f128c699aba3 release-pypy2.7-v7.0.0 +1f86f25937b6ae6c8b25236c35228fac587678bf release-pypy3.5-v7.0.0 +928a4f70d3de7d17449456946154c5da6e600162 release-pypy3.5-v7.0.0 +dab365a465140aa79a5f3ba4db784c4af4d5c195 release-pypy3.6-v7.0.0 +fb40f7a5524c77b80e6c468e087d621610137261 release-pypy3.6-v7.0.0 diff --git a/TODO b/TODO new file mode 100644 --- /dev/null +++ b/TODO @@ -0,0 +1,20 @@ +* find a better way to run "find" without creating the index storage, if one + if one is not already readily available (understand cost now, improve after merge) +* improve performance of splitlines (CF) +* think about cost of utf8 list strategy (CF) +* revisit why runicode import str_decode_utf_8_impl needed instead of runicode + import str_decode_utf_8 +* revisit remaining places in win32 where we do utf8.decode('utf-8'), they should work + directly with utf8 (can be converted via runicode.str_decode_utf_8 as well) + - rutf8.utf8_encode_mbcs + - unicodehelper.fsencode + - _winreg.interp_winreg +* remove 'assert not isinstance(*, unicode) +* add a flag that prevents support for unicode in rpython and enable it in PyPy (CF, Armin) +* convert all realunicode_w to unicode_w after we flush out all old uses of + unicode_w +* review all uses of W_Unicode.text_w, right now it is exactly W_Unicode.utf8_w. + It shoud only return valid utf8 (see 0be26dc39a59 which broke translation on + win32 and failed tests on linux64). Then we can use it in places like + _socket.interp_func.getaddrinfo instead of space.encode_unicode_object(w_port, + 'utf-8', 'strict') diff --git a/pypy/TODO b/pypy/TODO --- a/pypy/TODO +++ b/pypy/TODO @@ -1,6 +1,3 @@ -... - - antocuni's older TODO: * run coverage against the parser/astbuilder/astcompiler: it's probably full of @@ -11,3 +8,5 @@ * re-enable BUILD_LIST_FROM_ARG: see the comment in astcompiler/codegen.py in ast.ListComp.build_container + +* review use of std_decode_utf8, we probably do not want to be using it diff --git a/pypy/doc/release-v7.0.0.rst b/pypy/doc/release-v7.0.0.rst --- a/pypy/doc/release-v7.0.0.rst +++ b/pypy/doc/release-v7.0.0.rst @@ -39,7 +39,7 @@ The utf8 branch that changes internal representation of unicode to utf8 did not make it into the release, so there is still more goodness coming. -You can download the v6.0 releases here: +You can download the v7.0 releases here: http://pypy.org/download.html diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -5,6 +5,11 @@ .. this is a revision shortly after release-pypy-7.0.0 .. startrev: 481c69f7d81f +.. branch: zlib-copying-third-time-a-charm + +Make sure zlib decompressobjs have their streams deallocated immediately +on flush. + .. branch: zlib-copying-redux Fix calling copy on already-flushed compressobjs. @@ -15,7 +20,11 @@ as they do on CPython. -.. math-improvements +.. branch: math-improvements Improve performance of long operations where one of the operands fits into -an int. \ No newline at end of file +an int. + +.. branch: regalloc-playgrounds + +Improve register allocation in the JIT. diff --git a/pypy/doc/whatsnew-pypy2-5.10.0.rst b/pypy/doc/whatsnew-pypy2-5.10.0.rst --- a/pypy/doc/whatsnew-pypy2-5.10.0.rst +++ b/pypy/doc/whatsnew-pypy2-5.10.0.rst @@ -1,42 +1,42 @@ -========================== -What's new in PyPy2.7 5.10 -========================== - -.. this is a revision shortly after release-pypy2.7-v5.9.0 -.. startrev:d56dadcef996 - - -.. branch: cppyy-packaging - -Cleanup and improve cppyy packaging - -.. branch: docs-osx-brew-openssl - -.. branch: keep-debug-symbols - -Add a smartstrip tool, which can optionally keep the debug symbols in a -separate file, instead of just stripping them away. Use it in packaging - -.. branch: bsd-patches - -Fix failures on FreeBSD, contributed by David Naylor as patches on the issue -tracker (issues 2694, 2695, 2696, 2697) - -.. branch: run-extra-tests - -Run extra_tests/ in buildbot - -.. branch: vmprof-0.4.10 - -Upgrade the _vmprof backend to vmprof 0.4.10 - -.. branch: fix-vmprof-stacklet-switch -.. branch: fix-vmprof-stacklet-switch-2 - -Fix a vmprof+continulets (i.e. greenelts, eventlet, gevent, ...) - -.. branch: win32-vcvars - -.. branch: rdict-fast-hash - -Make it possible to declare that the hash function of an r_dict is fast in RPython. +========================== +What's new in PyPy2.7 5.10 +========================== + +.. this is a revision shortly after release-pypy2.7-v5.9.0 +.. startrev:d56dadcef996 + + +.. branch: cppyy-packaging + +Cleanup and improve cppyy packaging + +.. branch: docs-osx-brew-openssl + +.. branch: keep-debug-symbols + +Add a smartstrip tool, which can optionally keep the debug symbols in a +separate file, instead of just stripping them away. Use it in packaging + +.. branch: bsd-patches + +Fix failures on FreeBSD, contributed by David Naylor as patches on the issue +tracker (issues 2694, 2695, 2696, 2697) + +.. branch: run-extra-tests + +Run extra_tests/ in buildbot + +.. branch: vmprof-0.4.10 + +Upgrade the _vmprof backend to vmprof 0.4.10 + +.. branch: fix-vmprof-stacklet-switch +.. branch: fix-vmprof-stacklet-switch-2 + +Fix a vmprof+continulets (i.e. greenelts, eventlet, gevent, ...) + +.. branch: win32-vcvars + +.. branch: rdict-fast-hash + +Make it possible to declare that the hash function of an r_dict is fast in RPython. diff --git a/pypy/doc/whatsnew-pypy2-6.0.0.rst b/pypy/doc/whatsnew-pypy2-6.0.0.rst --- a/pypy/doc/whatsnew-pypy2-6.0.0.rst +++ b/pypy/doc/whatsnew-pypy2-6.0.0.rst @@ -1,132 +1,128 @@ -=========================== -What's new in PyPy2.7 5.10+ -=========================== - -.. this is a revision shortly after release-pypy2.7-v5.10.0 -.. startrev: 6b024edd9d12 - -.. branch: cpyext-avoid-roundtrip - -Big refactoring of some cpyext code, which avoids a lot of nonsense when -calling C from Python and vice-versa: the result is a big speedup in -function/method calls, up to 6 times faster. - -.. branch: cpyext-datetime2 - -Support ``tzinfo`` field on C-API datetime objects, fixes latest pandas HEAD - - -.. branch: mapdict-size-limit - -Fix a corner case of mapdict: When an instance is used like a dict (using -``setattr`` and ``getattr``, or ``.__dict__``) and a lot of attributes are -added, then the performance using mapdict is linear in the number of -attributes. This is now fixed (by switching to a regular dict after 80 -attributes). - - -.. branch: cpyext-faster-arg-passing - -When using cpyext, improve the speed of passing certain objects from PyPy to C -code, most notably None, True, False, types, all instances of C-defined types. -Before, a dict lookup was needed every time such an object crossed over, now it -is just a field read. - - -.. branch: 2634_datetime_timedelta_performance - -Improve datetime + timedelta performance. - -.. branch: memory-accounting - -Improve way to describe memory - -.. branch: msvc14 - -Allow compilaiton with Visual Studio 2017 compiler suite on windows - -.. branch: winapi - -Update _winapi and internal _winbase_cffi (via _winbase_build) for python 3 - -.. branch: refactor-slots - -Refactor cpyext slots. - - -.. branch: call-loopinvariant-into-bridges - -Speed up branchy code that does a lot of function inlining by saving one call -to read the TLS in most bridges. - -.. branch: rpython-sprint - -Refactor in rpython signatures - -.. branch: cpyext-tls-operror2 - -Store error state thread-locally in executioncontext, fixes issue #2764 - -.. branch: cpyext-fast-typecheck - -Optimize `Py*_Check` for `Bool`, `Float`, `Set`. Also refactor and simplify -`W_PyCWrapperObject` which is used to call slots from the C-API, greatly -improving microbenchmarks in https://github.com/antocuni/cpyext-benchmarks - - -.. branch: fix-sre-problems - -Fix two (unrelated) JIT bugs manifesting in the re module: - -- green fields are broken and were thus disabled, plus their usage removed from - the _sre implementation - -- in rare "trace is too long" situations, the JIT could break behaviour - arbitrarily. - -.. branch: jit-hooks-can-be-disabled - -Be more efficient about JIT hooks. Make it possible for the frontend to declare -that jit hooks are currently not enabled at all. in that case, the list of ops -does not have to be created in the case of the on_abort hook (which is -expensive). - - -.. branch: pyparser-improvements - -Improve speed of Python parser, improve ParseError messages slightly. - -.. branch: ioctl-arg-size - -Work around possible bugs in upstream ioctl users, like CPython allocate at -least 1024 bytes for the arg in calls to ``ioctl(fd, request, arg)``. Fixes -issue #2776 - -.. branch: cpyext-subclass-setattr - -Fix for python-level classes that inherit from C-API types, previously the -`w_obj` was not necessarily preserved throughout the lifetime of the `pyobj` -which led to cases where instance attributes were lost. Fixes issue #2793 - - -.. branch: pyparser-improvements-2 - -Improve line offsets that are reported by SyntaxError. Improve error messages -for a few situations, including mismatched parenthesis. - -.. branch: issue2752 - -Fix a rare GC bug that was introduced more than one year ago, but was -not diagnosed before issue #2752. - -.. branch: gc-hooks - -Introduce GC hooks, as documented in doc/gc_info.rst - -.. branch: gc-hook-better-timestamp - -Improve GC hooks - -.. branch: cppyy-packaging - -Update backend to 0.6.0 and support exceptions through wrappers +=========================== +What's new in PyPy2.7 5.10+ +=========================== + +.. this is a revision shortly after release-pypy2.7-v5.10.0 +.. startrev: 6b024edd9d12 + +.. branch: cpyext-avoid-roundtrip + +Big refactoring of some cpyext code, which avoids a lot of nonsense when +calling C from Python and vice-versa: the result is a big speedup in +function/method calls, up to 6 times faster. + +.. branch: cpyext-datetime2 + +Support ``tzinfo`` field on C-API datetime objects, fixes latest pandas HEAD + + +.. branch: mapdict-size-limit + +Fix a corner case of mapdict: When an instance is used like a dict (using +``setattr`` and ``getattr``, or ``.__dict__``) and a lot of attributes are +added, then the performance using mapdict is linear in the number of +attributes. This is now fixed (by switching to a regular dict after 80 +attributes). + + +.. branch: cpyext-faster-arg-passing + +When using cpyext, improve the speed of passing certain objects from PyPy to C +code, most notably None, True, False, types, all instances of C-defined types. +Before, a dict lookup was needed every time such an object crossed over, now it +is just a field read. + + +.. branch: 2634_datetime_timedelta_performance + +Improve datetime + timedelta performance. + +.. branch: memory-accounting + +Improve way to describe memory + +.. branch: msvc14 + +Allow compilaiton with Visual Studio 2017 compiler suite on windows + +.. branch: refactor-slots + +Refactor cpyext slots. + + +.. branch: call-loopinvariant-into-bridges + +Speed up branchy code that does a lot of function inlining by saving one call +to read the TLS in most bridges. + +.. branch: rpython-sprint + +Refactor in rpython signatures + +.. branch: cpyext-tls-operror2 + +Store error state thread-locally in executioncontext, fixes issue #2764 + +.. branch: cpyext-fast-typecheck + +Optimize `Py*_Check` for `Bool`, `Float`, `Set`. Also refactor and simplify +`W_PyCWrapperObject` which is used to call slots from the C-API, greatly +improving microbenchmarks in https://github.com/antocuni/cpyext-benchmarks + + +.. branch: fix-sre-problems + +Fix two (unrelated) JIT bugs manifesting in the re module: + +- green fields are broken and were thus disabled, plus their usage removed from + the _sre implementation + +- in rare "trace is too long" situations, the JIT could break behaviour + arbitrarily. + +.. branch: jit-hooks-can-be-disabled + +Be more efficient about JIT hooks. Make it possible for the frontend to declare +that jit hooks are currently not enabled at all. in that case, the list of ops +does not have to be created in the case of the on_abort hook (which is +expensive). + + +.. branch: pyparser-improvements + +Improve speed of Python parser, improve ParseError messages slightly. + +.. branch: ioctl-arg-size + +Work around possible bugs in upstream ioctl users, like CPython allocate at +least 1024 bytes for the arg in calls to ``ioctl(fd, request, arg)``. Fixes +issue #2776 + +.. branch: cpyext-subclass-setattr + +Fix for python-level classes that inherit from C-API types, previously the +`w_obj` was not necessarily preserved throughout the lifetime of the `pyobj` +which led to cases where instance attributes were lost. Fixes issue #2793 + + +.. branch: pyparser-improvements-2 + +Improve line offsets that are reported by SyntaxError. Improve error messages +for a few situations, including mismatched parenthesis. + +.. branch: issue2752 + +Fix a rare GC bug that was introduced more than one year ago, but was +not diagnosed before issue #2752. + +.. branch: gc-hooks + +Introduce GC hooks, as documented in doc/gc_info.rst + +.. branch: gc-hook-better-timestamp + +Improve GC hooks + +.. branch: cppyy-packaging + +Update backend to 0.6.0 and support exceptions through wrappers diff --git a/pypy/doc/whatsnew-pypy2-7.0.0.rst b/pypy/doc/whatsnew-pypy2-7.0.0.rst --- a/pypy/doc/whatsnew-pypy2-7.0.0.rst +++ b/pypy/doc/whatsnew-pypy2-7.0.0.rst @@ -1,69 +1,69 @@ -========================== -What's new in PyPy2.7 6.0+ -========================== - -.. this is a revision shortly after release-pypy-6.0.0 -.. startrev: e50e11af23f1 - -.. branch: cppyy-packaging - -Main items: vastly better template resolution and improved performance. In -detail: upgrade to backend 1.4, improved handling of templated methods and -functions (in particular automatic deduction of types), improved pythonization -interface, range of compatibility fixes for Python3, free functions now take -fast libffi path when possible, moves for strings (incl. from Python str), -easier/faster handling of std::vector by numpy, improved and faster object -identity preservation - -.. branch: socket_default_timeout_blockingness - -Make sure 'blocking-ness' of socket is set along with default timeout - -.. branch: crypt_h - -Include crypt.h for crypt() on Linux - -.. branch: gc-more-logging - -Log additional gc-minor and gc-collect-step info in the PYPYLOG - -.. branch: reverse-debugger - -The reverse-debugger branch has been merged. For more information, see -https://bitbucket.org/pypy/revdb - - -.. branch: pyparser-improvements-3 - -Small refactorings in the Python parser. - -.. branch: fix-readme-typo - -.. branch: avoid_shell_injection_in_shutil - -Backport CPython fix for possible shell injection issue in `distutils.spawn`, -https://bugs.python.org/issue34540 - -.. branch: cffi_dlopen_unicode - -Enable use of unicode file names in `dlopen` - -.. branch: rlock-in-rpython - -Backport CPython fix for `thread.RLock` - - -.. branch: expose-gc-time - -Make GC hooks measure time in seconds (as opposed to an opaque unit). - -.. branch: cleanup-test_lib_pypy - -Update most test_lib_pypy/ tests and move them to extra_tests/. - -.. branch: gc-disable - -Make it possible to manually manage the GC by using a combination of -gc.disable() and gc.collect_step(). Make sure to write a proper release -announcement in which we explain that existing programs could leak memory if -they run for too much time between a gc.disable()/gc.enable() +========================== +What's new in PyPy2.7 6.0+ +========================== + +.. this is a revision shortly after release-pypy-6.0.0 +.. startrev: e50e11af23f1 + +.. branch: cppyy-packaging + +Main items: vastly better template resolution and improved performance. In +detail: upgrade to backend 1.4, improved handling of templated methods and +functions (in particular automatic deduction of types), improved pythonization +interface, range of compatibility fixes for Python3, free functions now take +fast libffi path when possible, moves for strings (incl. from Python str), +easier/faster handling of std::vector by numpy, improved and faster object +identity preservation + +.. branch: socket_default_timeout_blockingness + +Make sure 'blocking-ness' of socket is set along with default timeout + +.. branch: crypt_h + +Include crypt.h for crypt() on Linux + +.. branch: gc-more-logging + +Log additional gc-minor and gc-collect-step info in the PYPYLOG + +.. branch: reverse-debugger + +The reverse-debugger branch has been merged. For more information, see +https://bitbucket.org/pypy/revdb + + +.. branch: pyparser-improvements-3 + +Small refactorings in the Python parser. + +.. branch: fix-readme-typo + +.. branch: avoid_shell_injection_in_shutil + +Backport CPython fix for possible shell injection issue in `distutils.spawn`, +https://bugs.python.org/issue34540 + +.. branch: cffi_dlopen_unicode + +Enable use of unicode file names in `dlopen` + +.. branch: rlock-in-rpython + +Backport CPython fix for `thread.RLock` + + +.. branch: expose-gc-time + +Make GC hooks measure time in seconds (as opposed to an opaque unit). + +.. branch: cleanup-test_lib_pypy + +Update most test_lib_pypy/ tests and move them to extra_tests/. + +.. branch: gc-disable + +Make it possible to manually manage the GC by using a combination of +gc.disable() and gc.collect_step(). Make sure to write a proper release +announcement in which we explain that existing programs could leak memory if +they run for too much time between a gc.disable()/gc.enable() diff --git a/pypy/doc/whatsnew-pypy3-5.10.0.rst b/pypy/doc/whatsnew-pypy3-5.10.0.rst --- a/pypy/doc/whatsnew-pypy3-5.10.0.rst +++ b/pypy/doc/whatsnew-pypy3-5.10.0.rst @@ -1,21 +1,7 @@ -========================= -What's new in PyPy3 5.9+ -========================= - -.. this is the revision after release-pypy3.5-5.9 -.. startrev: be41e3ac0a29 - -.. branch: sched_yield -Add sched_yield posix attribute - -.. branch: py3.5-appexec -Raise if space.is_true(space.appexec()) used in app level tests, fix tests -that did this - -.. branch: py3.5-mac-embedding -Download and patch dependencies when building cffi-based stdlib modules - -.. branch: os_lockf - -.. branch: py3.5-xattr -Add posix.*attr() functions +======================== +What's new in PyPy3 7.0+ +======================== + +.. this is the revision after release-pypy3.5-v7.0 +.. startrev: 9d2fa7c63b7c + diff --git a/pypy/doc/whatsnew-pypy3-6.0.0.rst b/pypy/doc/whatsnew-pypy3-6.0.0.rst --- a/pypy/doc/whatsnew-pypy3-6.0.0.rst +++ b/pypy/doc/whatsnew-pypy3-6.0.0.rst @@ -1,28 +1,7 @@ -========================= -What's new in PyPy3 5.10+ -========================= +======================== +What's new in PyPy3 7.0+ +======================== -.. this is the revision after release-pypy3.5-v5.10 -.. startrev: 34c63fba0bba +.. this is the revision after release-pypy3.5-v7.0 +.. startrev: 9d2fa7c63b7c -.. branch: hroncok/fix-typeerror-str-does-not-support-the-b-1514414905375 - -Fix for bytestrings in console repl - -.. branch: py3-winreg - -Update winreg module to use unicode, wide-strings - -.. branch: cpyext-py3-instancemethod-attributes - -Add missing ``__doc__``, ``__module__``, ``__name__`` attributes to -``instancemethod`` - -.. branch: winapi - -Update support for _winapi cffi module for python3 - -.. branch: py3.5-refactor-slots - -Refactor cpyext slots. - diff --git a/pypy/doc/whatsnew-pypy3-7.0.0.rst b/pypy/doc/whatsnew-pypy3-7.0.0.rst --- a/pypy/doc/whatsnew-pypy3-7.0.0.rst +++ b/pypy/doc/whatsnew-pypy3-7.0.0.rst @@ -5,15 +5,10 @@ .. this is the revision after release-pypy3.5-v6.0 .. startrev: 580e3e26cd32 -.. branch: hroncok/fix-multiprocessing-regression-on-newer--1524656522151 +.. branch: unicode-utf8 -Fix multiprocessing regression on newer glibcs +Use utf-8 internally to represent unicode strings -.. branch: py3.5-user-site-impl +.. branch: unicode-utf8-py3 -Use implementation-specific site directories in sysconfig like in Python2 - -.. branch: py3.5-reverse-debugger - -The reverse-debugger branch has been merged. For more information, see -https://bitbucket.org/pypy/revdb +Use utf-8 internally to represent unicode strings diff --git a/pypy/goal/targetpypystandalone.py b/pypy/goal/targetpypystandalone.py --- a/pypy/goal/targetpypystandalone.py +++ b/pypy/goal/targetpypystandalone.py @@ -83,7 +83,7 @@ ## con.interact() except OperationError as e: debug("OperationError:") - debug(" operror-type: " + e.w_type.getname(space).encode('utf-8')) + debug(" operror-type: " + e.w_type.getname(space)) debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space)))) return 1 finally: @@ -91,7 +91,7 @@ space.finish() except OperationError as e: debug("OperationError:") - debug(" operror-type: " + e.w_type.getname(space).encode('utf-8')) + debug(" operror-type: " + e.w_type.getname(space)) debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space)))) return 1 return exitcode @@ -148,7 +148,7 @@ except OperationError as e: if verbose: debug("OperationError:") - debug(" operror-type: " + e.w_type.getname(space).encode('utf-8')) + debug(" operror-type: " + e.w_type.getname(space)) debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space)))) return rffi.cast(rffi.INT, -1) finally: @@ -202,7 +202,7 @@ """) except OperationError as e: debug("OperationError:") - debug(" operror-type: " + e.w_type.getname(space).encode('utf-8')) + debug(" operror-type: " + e.w_type.getname(space)) debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space)))) return -1 return 0 diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py --- a/pypy/interpreter/argument.py +++ b/pypy/interpreter/argument.py @@ -596,6 +596,10 @@ except IndexError: name = '?' else: + w_enc = space.newtext(space.sys.defaultencoding) + w_err = space.newtext("replace") + w_name = space.call_method(w_name, "encode", w_enc, + w_err) name = space.text_w(w_name) break self.kwd_name = name diff --git a/pypy/interpreter/astcompiler/astbuilder.py b/pypy/interpreter/astcompiler/astbuilder.py --- a/pypy/interpreter/astcompiler/astbuilder.py +++ b/pypy/interpreter/astcompiler/astbuilder.py @@ -58,6 +58,7 @@ self.space = space self.compile_info = compile_info self.root_node = n + # used in f-strings self.recursive_parser = recursive_parser def build_ast(self): diff --git a/pypy/interpreter/astcompiler/fstring.py b/pypy/interpreter/astcompiler/fstring.py --- a/pypy/interpreter/astcompiler/fstring.py +++ b/pypy/interpreter/astcompiler/fstring.py @@ -3,6 +3,7 @@ from pypy.interpreter import error from pypy.interpreter import unicodehelper from rpython.rlib.rstring import StringBuilder +from rpython.rlib.rutf8 import codepoints_in_utf8 def add_constant_string(astbuilder, joined_pieces, w_string, atom_node): @@ -21,10 +22,8 @@ joined_pieces.append(node(w_string, atom_node.get_lineno(), atom_node.get_column())) -def f_constant_string(astbuilder, joined_pieces, u, atom_node): - space = astbuilder.space - add_constant_string(astbuilder, joined_pieces, space.newunicode(u), - atom_node) +def f_constant_string(astbuilder, joined_pieces, w_u, atom_node): + add_constant_string(astbuilder, joined_pieces, w_u, atom_node) def f_string_compile(astbuilder, source, atom_node): # Note: a f-string is kept as a single literal up to here. @@ -259,20 +258,20 @@ i += 1 fstr.current_index = i + space = astbuilder.space literal = builder.build() + lgt = codepoints_in_utf8(literal) if not fstr.raw_mode and '\\' in literal: - space = astbuilder.space literal = parsestring.decode_unicode_utf8(space, literal, 0, len(literal)) - return unicodehelper.decode_unicode_escape(space, literal) - else: - return literal.decode('utf-8') + literal, lgt, pos = unicodehelper.decode_unicode_escape(space, literal) + return space.newtext(literal, lgt) def fstring_find_literal_and_expr(astbuilder, fstr, atom_node, rec): - # Return a tuple with the next literal part, and optionally the + # Return a tuple with the next literal part as a W_Unicode, and optionally the # following expression node. Updates the current index inside 'fstr'. - literal = fstring_find_literal(astbuilder, fstr, atom_node, rec) + w_u = fstring_find_literal(astbuilder, fstr, atom_node, rec) s = fstr.unparsed i = fstr.current_index @@ -284,7 +283,7 @@ # We must now be the start of an expression, on a '{'. assert s[i] == '{' expr = fstring_find_expr(astbuilder, fstr, atom_node, rec) - return literal, expr + return w_u, expr def parse_f_string(astbuilder, joined_pieces, fstr, atom_node, rec=0): @@ -303,11 +302,11 @@ "really the case", atom_node) while True: - literal, expr = fstring_find_literal_and_expr(astbuilder, fstr, + w_u, expr = fstring_find_literal_and_expr(astbuilder, fstr, atom_node, rec) # add the literal part - f_constant_string(astbuilder, joined_pieces, literal, atom_node) + f_constant_string(astbuilder, joined_pieces, w_u, atom_node) if expr is None: break # We're done with this f-string. diff --git a/pypy/interpreter/astcompiler/misc.py b/pypy/interpreter/astcompiler/misc.py --- a/pypy/interpreter/astcompiler/misc.py +++ b/pypy/interpreter/astcompiler/misc.py @@ -112,7 +112,7 @@ # only intern identifier-like strings from pypy.objspace.std.unicodeobject import _isidentifier if (space.is_w(space.type(w_const), space.w_unicode) and - _isidentifier(space.unicode_w(w_const))): + _isidentifier(space.utf8_w(w_const))): return space.new_interned_w_str(w_const) return w_const diff --git a/pypy/interpreter/astcompiler/optimize.py b/pypy/interpreter/astcompiler/optimize.py --- a/pypy/interpreter/astcompiler/optimize.py +++ b/pypy/interpreter/astcompiler/optimize.py @@ -5,7 +5,7 @@ from pypy.tool import stdlib_opcode as ops from pypy.interpreter.error import OperationError from rpython.rlib.unroll import unrolling_iterable -from rpython.rlib.runicode import MAXUNICODE +from rpython.rlib.rutf8 import MAXUNICODE from rpython.rlib.objectmodel import specialize @@ -326,7 +326,7 @@ # produce compatible pycs. if (self.space.isinstance_w(w_obj, self.space.w_unicode) and self.space.isinstance_w(w_const, self.space.w_unicode)): - #unistr = self.space.unicode_w(w_const) + #unistr = self.space.utf8_w(w_const) #if len(unistr) == 1: # ch = ord(unistr[0]) #else: diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py --- a/pypy/interpreter/astcompiler/test/test_astbuilder.py +++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py @@ -902,7 +902,7 @@ def test_flufl(self): source = "x <> y" - raises(SyntaxError, self.get_ast, source) + py.test.raises(SyntaxError, self.get_ast, source) comp = self.get_first_expr(source, flags=consts.CO_FUTURE_BARRY_AS_BDFL) assert isinstance(comp, ast.Compare) @@ -1130,7 +1130,7 @@ s = self.get_first_expr("b'hi' b' implicitly' b' extra'") assert isinstance(s, ast.Bytes) assert space.eq_w(s.s, space.newbytes("hi implicitly extra")) - raises(SyntaxError, self.get_first_expr, "b'hello' 'world'") + py.test.raises(SyntaxError, self.get_first_expr, "b'hello' 'world'") sentence = u"Die Männer ärgern sich!" source = u"# coding: utf-7\nstuff = '%s'" % (sentence,) info = pyparse.CompileInfo("<test>", "exec") @@ -1325,8 +1325,8 @@ assert isinstance(if2, ast.Name) def test_cpython_issue12983(self): - raises(SyntaxError, self.get_ast, r"""b'\x'""") - raises(SyntaxError, self.get_ast, r"""b'\x0'""") + py.test.raises(SyntaxError, self.get_ast, r"""b'\x'""") + py.test.raises(SyntaxError, self.get_ast, r"""b'\x0'""") def test_matmul(self): mod = self.get_ast("a @ b") diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py --- a/pypy/interpreter/astcompiler/test/test_compiler.py +++ b/pypy/interpreter/astcompiler/test/test_compiler.py @@ -1,5 +1,6 @@ from __future__ import division import py, sys +from pytest import raises from pypy.interpreter.astcompiler import codegen, astbuilder, symtable, optimize from pypy.interpreter.pyparser import pyparse from pypy.interpreter.pyparser.test import expressions @@ -76,7 +77,7 @@ space = self.space pyco_expr = PyCode._from_code(space, co_expr) w_res = pyco_expr.exec_host_bytecode(w_dict, w_dict) - res = space.str_w(space.repr(w_res)) + res = space.text_w(space.repr(w_res)) expected_repr = self.get_py3_repr(expected) if isinstance(expected, float): # Float representation can vary a bit between interpreter @@ -1249,7 +1250,6 @@ def test_revdb_metavar(self): from pypy.interpreter.reverse_debugging import dbstate, setup_revdb - self.space.config.translation.reverse_debugger = True self.space.reverse_debugging = True try: setup_revdb(self.space) @@ -1264,9 +1264,6 @@ class AppTestCompiler: - def setup_class(cls): - cls.w_maxunicode = cls.space.wrap(sys.maxunicode) - def test_docstring_not_loaded(self): import io, dis, sys ns = {} @@ -1428,7 +1425,7 @@ ''', d) return d['f'](5) """) - assert 'generator' in space.str_w(space.repr(w_generator)) + assert 'generator' in space.text_w(space.repr(w_generator)) def test_folding_of_list_constants(self): for source in ( diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -3,7 +3,7 @@ from rpython.rlib.cache import Cache from rpython.tool.uid import HUGEVAL_BYTES -from rpython.rlib import jit, types +from rpython.rlib import jit, types, rutf8 from rpython.rlib.debug import make_sure_not_resized from rpython.rlib.objectmodel import (we_are_translated, newlist_hint, compute_unique_id, specialize, not_rpython) @@ -80,10 +80,10 @@ def getname(self, space): try: - return space.unicode_w(space.getattr(self, space.newtext('__name__'))) + return space.utf8_w(space.getattr(self, space.newtext('__name__'))) except OperationError as e: if e.match(space, space.w_TypeError) or e.match(space, space.w_AttributeError): - return u'?' + return '?' raise def getaddrstring(self, space): @@ -105,9 +105,9 @@ w_id = space.rshift(w_id, w_4) return ''.join(addrstring) - def getrepr(self, space, info, moreinfo=u''): - addrstring = unicode(self.getaddrstring(space)) - return space.newunicode(u"<%s at 0x%s%s>" % (info, addrstring, moreinfo)) + def getrepr(self, space, info, moreinfo=''): + addrstring = self.getaddrstring(space) + return space.newtext("<%s at 0x%s%s>" % (info, addrstring, moreinfo)) def getslotvalue(self, index): raise NotImplementedError @@ -245,11 +245,14 @@ def bytes_w(self, space): self._typed_unwrap_error(space, "bytes") - def unicode_w(self, space): - self._typed_unwrap_error(space, "string") + def text_w(self, space): + self._typed_unwrap_error(space, "unicode") - def text_w(self, space): - self._typed_unwrap_error(space, "string") + def utf8_w(self, space): + self._typed_unwrap_error(space, "unicode") + + def convert_to_w_unicode(self, space): + self._typed_unwrap_error(space, "unicode") def bytearray_list_of_chars_w(self, space): self._typed_unwrap_error(space, "bytearray") @@ -420,7 +423,7 @@ self.builtin_modules = {} self.reloading_modules = {} - self.interned_strings = make_weak_value_dictionary(self, unicode, W_Root) + self.interned_strings = make_weak_value_dictionary(self, str, W_Root) self.actionflag = ActionFlag() # changed by the signal module self.check_signal_action = None # changed by the signal module make_finalizer_queue(W_Root, self) @@ -781,12 +784,12 @@ def setitem_str(self, w_obj, key, w_value): # key is a "text", i.e. a byte string (in python3 it - # represents a utf-8-encoded unicode) + # represents a valid utf-8-encoded unicode) return self.setitem(w_obj, self.newtext(key), w_value) def finditem_str(self, w_obj, key): # key is a "text", i.e. a byte string (in python3 it - # represents a utf-8-encoded unicode) + # represents a valid utf-8-encoded unicode) return self.finditem(w_obj, self.newtext(key)) def finditem(self, w_obj, w_key): @@ -820,9 +823,9 @@ def new_interned_w_str(self, w_u): assert isinstance(w_u, W_Root) # and is not None - u = self.unicode_w(w_u) + u = self.utf8_w(w_u) if not we_are_translated(): - assert type(u) is unicode + assert type(u) is str w_u1 = self.interned_strings.get(u) if w_u1 is None: w_u1 = w_u @@ -835,12 +838,11 @@ # returns a "text" object (ie str in python2 and unicode in python3) if not we_are_translated(): assert type(s) is str - u = s.decode('utf-8') - w_s1 = self.interned_strings.get(u) + w_s1 = self.interned_strings.get(s) if w_s1 is None: - w_s1 = self.newunicode(u) + w_s1 = self.newtext(s) if self._side_effects_ok(): - self.interned_strings.set(u, w_s1) + self.interned_strings.set(s, w_s1) return w_s1 def _revdb_startup(self): @@ -879,11 +881,7 @@ # interface for marshal_impl if not we_are_translated(): assert type(s) is str - try: - u = s.decode('utf-8') - except UnicodeDecodeError: - return None - return self.interned_strings.get(u) # may be None + return self.interned_strings.get(s) # may be None @specialize.arg(1) def descr_self_interp_w(self, RequiredClass, w_obj): @@ -1066,7 +1064,7 @@ """ return None - def listview_unicode(self, w_list): + def listview_utf8(self, w_list): """ Return a list of unwrapped unicode out of a list of unicode. If the argument is not a list or does not contain only unicode, return None. May return None anyway. @@ -1096,8 +1094,15 @@ def newlist_bytes(self, list_s): return self.newlist([self.newbytes(s) for s in list_s]) - def newlist_unicode(self, list_u): - return self.newlist([self.newunicode(u) for u in list_u]) + def newlist_utf8(self, list_u, is_ascii): + l_w = [None] * len(list_u) + for i, item in enumerate(list_u): + if not is_ascii: + length = rutf8.check_utf8(item, True) + else: + length = len(item) + l_w[i] = self.newutf8(item, length) + return self.newlist(l_w) def newlist_int(self, list_i): return self.newlist([self.newint(i) for i in list_i]) @@ -1595,6 +1600,8 @@ else: assert False + if self.isinstance_w(w_obj, self.w_unicode): + return w_obj.charbuf_w(self) def text_or_none_w(self, w_obj): return None if self.is_none(w_obj) else self.text_w(w_obj) @@ -1617,18 +1624,22 @@ an utf-8 encoded rpython string. """ assert w_obj is not None + if not self.isinstance_w(w_obj, self.w_unicode): + w_obj._typed_unwrap_error(self, "unicode") return w_obj.text_w(self) @not_rpython # tests only; should be replaced with bytes_w or text_w def str_w(self, w_obj): """ - if w_obj is unicode, call text_w() (i.e., return the UTF-8-nosg + if w_obj is unicode, call utf8_w() (i.e., return the UTF-8-nosg encoded string). Else, call bytes_w(). We should kill str_w completely and manually substitute it with text_w/bytes_w at all call sites. It remains for now for tests only. """ + XXX # deprecated, leaving in place for clear errors if self.isinstance_w(w_obj, self.w_unicode): + # XXX lo text_w, but better to deprecate str_w than to fix this return w_obj.text_w(self) else: return w_obj.bytes_w(self) @@ -1711,23 +1722,38 @@ assert w_obj is not None return w_obj.float_w(self, allow_conversion) - @specialize.argtype(1) - def unicode_w(self, w_obj): - assert w_obj is not None - return w_obj.unicode_w(self) + def utf8_w(self, w_obj): + return w_obj.utf8_w(self) - def unicode0_w(self, w_obj): - "Like unicode_w, but rejects strings with NUL bytes." + def utf8_0_w(self, w_obj): + "Like utf_w, but rejects strings with NUL bytes." from rpython.rlib import rstring - result = w_obj.unicode_w(self) - if u'\x00' in result: + result = w_obj.utf8_w(self) + if '\x00' in result: + raise oefmt(self.w_TypeError, + "argument must be a string without NUL " + "characters") + return rstring.assert_str0(result) + + def convert_to_w_unicode(self, w_obj): + return w_obj.convert_to_w_unicode(self) + + def realunicode_w(self, w_obj): + from pypy.interpreter.unicodehelper import decode_utf8sp + utf8 = self.utf8_w(w_obj) + return decode_utf8sp(self, utf8)[0].decode('utf8') + + def utf8_0_w(self, w_obj): + "Like utf8_w, but rejects strings with NUL bytes." + from rpython.rlib import rstring + result = w_obj.utf8_w(self) + if '\x00' in result: raise oefmt(self.w_ValueError, - "argument must be a unicode string without NUL " + "argument must be a utf8 string without NUL " "characters") return rstring.assert_str0(result) realtext_w = text_w # Python 2 compatibility - realunicode_w = unicode_w def fsencode(space, w_obj): from pypy.interpreter.unicodehelper import fsencode @@ -1742,6 +1768,27 @@ w_obj = self.fsencode(w_obj) return self.bytesbuf0_w(w_obj) + def convert_arg_to_w_unicode(self, w_obj, strict=None): + # XXX why convert_to_w_unicode does something slightly different? + from pypy.objspace.std.unicodeobject import W_UnicodeObject + # for z_translation tests + if hasattr(self, 'is_fake_objspace'): return self.newtext("foobar") + return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict) + + def utf8_len_w(self, w_obj): + w_obj = self.convert_arg_to_w_unicode(w_obj) + return w_obj._utf8, w_obj._len() + + def realutf8_w(self, w_obj): + # Like utf8_w(), but only works if w_obj is really of type + # 'unicode'. On Python 3 this is the same as utf8_w(). + from pypy.objspace.std.unicodeobject import W_UnicodeObject + # for z_translation tests + if hasattr(self, 'is_fake_objspace'): return self.newtext("foobar") + if not isinstance(w_obj, W_UnicodeObject): + raise oefmt(self.w_TypeError, "argument must be a unicode") + return self.utf8_w(w_obj) + def bytesbuf0_w(self, w_obj): # Like bytes0_w(), but also accept a read-only buffer. from rpython.rlib import rstring @@ -1759,7 +1806,7 @@ def fsdecode_w(self, w_obj): if self.isinstance_w(w_obj, self.w_bytes): w_obj = self.fsdecode(w_obj) - return self.unicode0_w(w_obj) + return self.utf8_w(w_obj) def bool_w(self, w_obj): # Unwraps a bool, also accepting an int for compatibility. @@ -2087,7 +2134,7 @@ 'float_w', 'uint_w', 'bigint_w', - 'unicode_w', + 'utf8_w', 'unwrap', 'is_true', 'is_w', diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py --- a/pypy/interpreter/error.py +++ b/pypy/interpreter/error.py @@ -9,8 +9,7 @@ from rpython.rlib.objectmodel import we_are_translated, specialize from rpython.rlib.objectmodel import dont_inline, not_rpython from rpython.rlib import rstack, rstackovf -from rpython.rlib import rwin32 -from rpython.rlib import runicode +from rpython.rlib import rwin32, rutf8 from pypy.interpreter import debug @@ -21,7 +20,8 @@ def strerror(errno): """Translate an error code to a unicode message string.""" from pypy.module._codecs.locale import str_decode_locale_surrogateescape - return str_decode_locale_surrogateescape(os.strerror(errno)) + utf8, lgt = str_decode_locale_surrogateescape(os.strerror(errno)) + return utf8, lgt class OperationError(Exception): """Interpreter-level exception that signals an exception that should be @@ -72,7 +72,7 @@ space = getattr(self.w_type, 'space', None) if space is not None: if self.__class__ is not OperationError and s is None: - s = self._compute_value(space) + s, lgt = self._compute_value(space) try: s = space.text_w(s) except Exception: @@ -306,8 +306,8 @@ def get_w_value(self, space): w_value = self._w_value if w_value is None: - value = self._compute_value(space) - self._w_value = w_value = space.newunicode(value) + value, lgt = self._compute_value(space) + self._w_value = w_value = space.newtext(value, lgt) return w_value def _compute_value(self, space): @@ -472,16 +472,7 @@ assert len(formats) > 0, "unsupported: no % command found" return tuple(parts), tuple(formats) -def _decode_utf8(string): - # when building the error message, don't crash if the byte string - # provided is not valid UTF-8 - assert isinstance(string, str) - result, consumed = runicode.str_decode_utf_8( - string, len(string), "replace", final=True) - return result - def get_operrcls2(valuefmt): - valuefmt = valuefmt.decode('ascii') strings, formats = decompose_valuefmt(valuefmt) assert len(strings) == len(formats) + 1 try: @@ -501,30 +492,49 @@ def _compute_value(self, space): lst = [None] * (len(formats) + len(formats) + 1) + lgt = 0 for i, fmt, attr in entries: lst[i + i] = self.xstrings[i] + lgt += len(self.xstrings[i]) value = getattr(self, attr) if fmt == 'd': - result = str(value).decode('ascii') + result = str(value) + lgt += len(result) elif fmt == 'R': - result = space.unicode_w(space.repr(value)) + result = space.utf8_w(space.repr(value)) + lgt += len(result) elif fmt == 'S': - result = space.unicode_w(space.str(value)) + result = space.utf8_w(space.str(value)) + lgt += len(result) elif fmt == 'T': - result = _decode_utf8(space.type(value).name) + result = space.type(value).name + lgt += len(result) elif fmt == 'N': result = value.getname(space) + lgt += len(result) elif fmt == '8': - result = _decode_utf8(value) + # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'" + from pypy.interpreter import unicodehelper + result, _lgt, pos = unicodehelper.str_decode_utf8( + value, 'replace', True, + unicodehelper.decode_never_raise, True) + lgt += _lgt + elif isinstance(value, unicode): + # 's' + result = str(value.encode('utf-8')) + lgt += len(value) else: - if isinstance(value, unicode): - result = value - else: - result = _decode_utf8(str(value)) + result = str(value) + try: + lgt += rutf8.check_utf8(result, True) + except rutf8.CheckError as e: + lgt -= e.pos lst[i + i + 1] = result lst[-1] = self.xstrings[-1] - return u''.join(lst) - # + lgt += len(self.xstrings[-1]) + retval = ''.join(lst) + return retval, lgt + _fmtcache2[formats] = OpErrFmt return OpErrFmt, strings @@ -534,7 +544,7 @@ self.setup(w_type) def _compute_value(self, space): - return self._value.decode('utf-8') + return self._value, len(self._value) def async(self, space): # also matches a RuntimeError("maximum rec.") if the stack is @@ -565,8 +575,8 @@ %8 - The result of arg.decode('utf-8') %N - The result of w_arg.getname(space) - %R - The result of space.unicode_w(space.repr(w_arg)) - %S - The result of space.unicode_w(space.str(w_arg)) + %R - The result of space.utf8_w(space.repr(w_arg)) + %S - The result of space.utf8_w(space.str(w_arg)) %T - The result of space.type(w_arg).name """ @@ -621,12 +631,13 @@ if rwin32.WIN32 and isinstance(e, WindowsError): winerror = e.winerror try: - msg = rwin32.FormatErrorW(winerror) + msg, lgt = rwin32.FormatErrorW(winerror) except ValueError: - msg = u'Windows Error %d' % winerror + msg = 'Windows Error %d' % winerror + lgt = len(msg) w_errno = space.w_None w_winerror = space.newint(winerror) - w_msg = space.newunicode(msg) + w_msg = space.newtext(msg, lgt) else: errno = e.errno if errno == EINTR: @@ -635,12 +646,13 @@ return None try: - msg = strerror(errno) + msg, lgt = strerror(errno) except ValueError: - msg = u'error %d' % errno + msg = 'error %d' % errno + lgt = len(msg) w_errno = space.newint(errno) w_winerror = space.w_None - w_msg = space.newunicode(msg) + w_msg = space.newtext(msg, lgt) if w_filename is None: w_filename = space.w_None @@ -670,9 +682,9 @@ eintr_retry=eintr_retry) def exception_from_errno(space, w_type, errno): - msg = strerror(errno) + msg, lgt = strerror(errno) w_error = space.call_function(w_type, space.newint(errno), - space.newunicode(msg)) + space.newtext(msg, lgt)) return OperationError(w_type, w_error) def exception_from_saved_errno(space, w_type): diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py --- a/pypy/interpreter/function.py +++ b/pypy/interpreter/function.py @@ -45,7 +45,8 @@ closure=None, w_ann=None, forcename=None, qualname=None): self.space = space self.name = forcename or code.co_name - self.qualname = qualname or self.name.decode('utf-8') + self.qualname = qualname or self.name + assert isinstance(self.qualname, str) self.w_doc = None # lazily read from code.getdocstring() self.code = code # Code instance self.w_func_globals = w_globals # the globals dictionary @@ -255,7 +256,7 @@ return self.call_args(__args__) def descr_function_repr(self): - return self.getrepr(self.space, u'function %s' % self.qualname) + return self.getrepr(self.space, 'function %s' % self.qualname) def _cleanup_(self): @@ -313,7 +314,7 @@ tup_base = [] tup_state = [ space.newtext(self.name), - space.newunicode(self.qualname), + space.newtext(self.qualname), w_doc, self.code, w_func_globals, @@ -337,7 +338,7 @@ self.space = space self.name = space.text_w(w_name) - self.qualname = space.unicode_w(w_qualname) + self.qualname = space.utf8_w(w_qualname) self.code = space.interp_w(Code, w_code) if not space.is_w(w_closure, space.w_None): from pypy.interpreter.nestedscope import Cell @@ -430,11 +431,11 @@ "__name__ must be set to a string object") def fget_func_qualname(self, space): - return space.newunicode(self.qualname) + return space.newtext(self.qualname) def fset_func_qualname(self, space, w_name): try: - self.qualname = space.unicode_w(w_name) + self.qualname = space.realutf8_w(w_name) except OperationError as e: if e.match(space, space.w_TypeError): raise oefmt(space.w_TypeError, @@ -549,14 +550,14 @@ name = self.w_function.getname(self.space) else: try: - name = space.unicode_w(w_name) + name = space.utf8_w(w_name) except OperationError as e: if not e.match(space, space.w_TypeError): raise - name = u'?' - objrepr = space.unicode_w(space.repr(self.w_instance)) - s = u'<bound method %s of %s>' % (name, objrepr) - return space.newunicode(s) + name = '?' + objrepr = space.utf8_w(space.repr(self.w_instance)) + s = b'<bound method %s of %s>' % (name, objrepr) + return space.newtext(s) def descr_method_getattribute(self, w_attr): space = self.space @@ -598,7 +599,7 @@ else: w_builtins = space.getbuiltinmodule('builtins') new_inst = space.getattr(w_builtins, space.newtext('getattr')) - tup = [w_instance, space.newunicode(w_function.getname(space))] + tup = [w_instance, space.newtext(w_function.getname(space))] return space.newtuple([new_inst, space.newtuple(tup)]) @@ -699,7 +700,7 @@ return self.space.newtext('<built-in function %s>' % (self.name,)) def descr__reduce__(self, space): - return space.newunicode(self.qualname) + return space.newtext(self.qualname) def is_builtin_code(w_func): from pypy.interpreter.gateway import BuiltinCode diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py --- a/pypy/interpreter/gateway.py +++ b/pypy/interpreter/gateway.py @@ -174,6 +174,9 @@ def visit_unicode(self, el, app_sig): self.checked_space_method(el, app_sig) + def visit_utf8(self, el, app_sig): + self.checked_space_method(el, app_sig) + def visit_fsencode(self, el, app_sig): self.checked_space_method(el, app_sig) @@ -324,7 +327,10 @@ self.run_args.append("space.text0_w(%s)" % (self.scopenext(),)) def visit_unicode(self, typ): - self.run_args.append("space.unicode_w(%s)" % (self.scopenext(),)) + self.run_args.append("space.realunicode_w(%s)" % (self.scopenext(),)) + + def visit_utf8(self, typ): + self.run_args.append("space.utf8_w(%s)" % (self.scopenext(),)) def visit_fsencode(self, typ): self.run_args.append("space.fsencode_w(%s)" % (self.scopenext(),)) @@ -492,11 +498,14 @@ self.unwrap.append("space.text_w(%s)" % (self.nextarg(),)) def visit_unicode(self, typ): - self.unwrap.append("space.unicode_w(%s)" % (self.nextarg(),)) + self.unwrap.append("space.realunicode_w(%s)" % (self.nextarg(),)) def visit_text0(self, typ): self.unwrap.append("space.text0_w(%s)" % (self.nextarg(),)) + def visit_utf8(self, typ): + self.unwrap.append("space.utf8_w(%s)" % (self.nextarg(),)) + def visit_fsencode(self, typ): self.unwrap.append("space.fsencode_w(%s)" % (self.nextarg(),)) @@ -567,8 +576,10 @@ assert typ in (int, str, float, unicode, r_longlong, r_uint, r_ulonglong, bool) if typ is r_int is r_longlong: return 'gateway_r_longlong_w' - elif typ in (str, unicode): - return typ.__name__ + '_w' + elif typ is str: + return 'utf8_w' + elif typ is unicode: + return 'realunicode_w' elif typ is bool: # For argument clinic's "bool" specifier: accept any object, and # convert it to a boolean value. If you don't want this @@ -1113,7 +1124,7 @@ kw_defs_w = [] for name, w_def in sorted(alldefs_w.items()): assert name in sig.kwonlyargnames - w_name = space.newunicode(name.decode('utf-8')) + w_name = space.newtext(name) kw_defs_w.append((w_name, w_def)) return defs_w, kw_defs_w diff --git a/pypy/interpreter/generator.py b/pypy/interpreter/generator.py --- a/pypy/interpreter/generator.py +++ b/pypy/interpreter/generator.py @@ -38,14 +38,12 @@ # 'qualname' is a unicode string if self._qualname is not None: return self._qualname - return self.get_name().decode('utf-8') + return self.get_name() def descr__repr__(self, space): addrstring = self.getaddrstring(space) - return space.newunicode(u"<%s object %s at 0x%s>" % - (unicode(self.KIND), - self.get_qualname(), - unicode(addrstring))) + return space.newtext("<%s object %s at 0x%s>" % + (self.KIND, self.get_qualname(), addrstring)) def descr_send(self, w_arg): """send(arg) -> send 'arg' into generator/coroutine, @@ -215,7 +213,7 @@ e2.record_context(space, space.getexecutioncontext()) raise e2 else: - space.warn(space.newunicode(u"generator '%s' raised StopIteration" + space.warn(space.newtext("generator '%s' raised StopIteration" % self.get_qualname()), space.w_PendingDeprecationWarning) @@ -308,11 +306,11 @@ "__name__ must be set to a string object") def descr__qualname__(self, space): - return space.newunicode(self.get_qualname()) + return space.newtext(self.get_qualname()) def descr_set__qualname__(self, space, w_name): try: - self._qualname = space.unicode_w(w_name) + self._qualname = space.utf8_w(w_name) except OperationError as e: if e.match(space, space.w_TypeError): raise oefmt(space.w_TypeError, @@ -399,8 +397,8 @@ self.frame is not None and \ self.frame.last_instr == -1: space = self.space - msg = u"coroutine '%s' was never awaited" % self.get_qualname() - space.warn(space.newunicode(msg), space.w_RuntimeWarning) + msg = "coroutine '%s' was never awaited" % self.get_qualname() + space.warn(space.newtext(msg), space.w_RuntimeWarning) GeneratorOrCoroutine._finalize_(self) diff --git a/pypy/interpreter/mixedmodule.py b/pypy/interpreter/mixedmodule.py --- a/pypy/interpreter/mixedmodule.py +++ b/pypy/interpreter/mixedmodule.py @@ -130,7 +130,7 @@ bltin.w_module = self.w_name func._builtinversion_ = bltin bltin.name = name - bltin.qualname = bltin.name.decode('utf-8') + bltin.qualname = bltin.name w_value = bltin space.setitem(self.w_dict, w_name, w_value) return w_value diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py --- a/pypy/interpreter/pycode.py +++ b/pypy/interpreter/pycode.py @@ -206,7 +206,7 @@ self.co_filename = '<builtin>/%s' % (basename,) self.w_filename = self.space.newfilename(self.co_filename) - co_names = property(lambda self: [self.space.str_w(w_name) for w_name in self.co_names_w]) # for trace + co_names = property(lambda self: [self.space.text_w(w_name) for w_name in self.co_names_w]) # for trace def signature(self): return self._signature @@ -452,8 +452,8 @@ def repr(self, space): space = self.space # co_name should be an identifier - name = self.co_name.decode('utf-8') - fn = space.unicode_w(self.w_filename) - return space.newunicode(u'<code object %s at 0x%s, file "%s", line %d>' % ( - name, unicode(self.getaddrstring(space)), fn, + name = self.co_name + fn = space.utf8_w(self.w_filename) + return space.newtext(b'<code object %s at 0x%s, file "%s", line %d>' % ( + name, self.getaddrstring(space), fn, -1 if self.co_firstlineno == 0 else self.co_firstlineno)) diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py --- a/pypy/interpreter/pyopcode.py +++ b/pypy/interpreter/pyopcode.py @@ -1081,8 +1081,8 @@ try: w_pkgname = space.getattr( w_module, space.newtext('__name__')) - w_fullname = space.newunicode(u'%s.%s' % - (space.unicode_w(w_pkgname), space.unicode_w(w_name))) + w_fullname = space.newtext(b'%s.%s' % + (space.utf8_w(w_pkgname), space.utf8_w(w_name))) return space.getitem(space.sys.get('modules'), w_fullname) except OperationError: raise oefmt( @@ -1333,7 +1333,7 @@ def _make_function(self, oparg, freevars=None): space = self.space w_qualname = self.popvalue() - qualname = self.space.unicode_w(w_qualname) + qualname = self.space.utf8_w(w_qualname) w_codeobj = self.popvalue() codeobj = self.space.interp_w(PyCode, w_codeobj) if freevars is not None: @@ -1628,7 +1628,7 @@ if (oparg & consts.FVS_MASK) == consts.FVS_HAVE_SPEC: w_spec = self.popvalue() else: - w_spec = space.newunicode(u'') + w_spec = space.newtext('') w_value = self.popvalue() # conversion = oparg & consts.FVC_MASK @@ -1649,9 +1649,9 @@ lst = [] for i in range(itemcount-1, -1, -1): w_item = self.peekvalue(i) - lst.append(space.unicode_w(w_item)) + lst.append(space.utf8_w(w_item)) self.dropvalues(itemcount) - w_res = space.newunicode(u''.join(lst)) + w_res = space.newtext(''.join(lst)) self.pushvalue(w_res) def _revdb_load_var(self, oparg): diff --git a/pypy/interpreter/pyparser/error.py b/pypy/interpreter/pyparser/error.py --- a/pypy/interpreter/pyparser/error.py +++ b/pypy/interpreter/pyparser/error.py @@ -29,20 +29,24 @@ except: # we can't allow any exceptions here! return None""") elif self.text is not None: - from rpython.rlib.runicode import str_decode_utf_8 + from rpython.rlib.runicode import str_decode_utf_8_impl # self.text may not be UTF-8 in case of decoding errors. # adjust the encoded text offset to a decoded offset # XXX do the right thing about continuation lines, which # XXX are their own fun, sometimes giving offset > # XXX len(self.text) for example (right now, avoid crashing) + def replace_error_handler(errors, encoding, msg, s, startpos, endpos): + # must return unicode + return u'\ufffd', endpos if offset > len(self.text): offset = len(self.text) - text, _ = str_decode_utf_8(self.text, offset, 'replace') + text, _ = str_decode_utf_8_impl(self.text, offset, + 'replace', False, replace_error_handler, True) offset = len(text) if len(self.text) != offset: - text, _ = str_decode_utf_8(self.text, len(self.text), - 'replace') - w_text = space.newunicode(text) + text, _ = str_decode_utf_8_impl(self.text, len(self.text), + 'replace', False, replace_error_handler, True) + w_text = space.newtext(text.encode('utf8'), len(text)) return space.newtuple([ space.newtext(self.msg), space.newtuple([ diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -1,4 +1,5 @@ # coding: utf-8 +from rpython.rlib import rutf8 from pypy.interpreter.baseobjspace import W_Root from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter import unicodehelper @@ -91,9 +92,11 @@ if encoding is None: substr = s[ps:q] else: + unicodehelper.check_utf8_or_raise(space, s, ps, q) substr = decode_unicode_utf8(space, s, ps, q) - v = unicodehelper.decode_unicode_escape(space, substr) - return space.newunicode(v) + r = unicodehelper.decode_unicode_escape(space, substr) + v, length, pos = r + return space.newutf8(v, length) assert 0 <= ps <= q substr = s[ps : q] @@ -111,8 +114,8 @@ elif saw_f: return W_FString(substr, rawmode) else: - v = unicodehelper.decode_utf8(space, substr) - return space.newunicode(v) + v = unicodehelper.str_decode_utf8(substr, 'strict', True, None) + return space.newtext(*v) v = PyString_DecodeEscape(space, substr, 'strict', encoding) return space.newbytes(v) @@ -135,15 +138,12 @@ # the backslash we just wrote, we emit "\u005c" # instead. lis.append("u005c") - if ord(s[ps]) & 0x80: # XXX inefficient - w, ps = decode_utf8(space, s, ps, end) - for c in w: - # The equivalent of %08x, which is not supported by RPython. - # 7 zeroes are enough for the unicode range, and the - # result still fits in 32-bit. - hexa = hex(ord(c) + 0x10000000) - lis.append('\\U0') - lis.append(hexa[3:]) # Skip 0x and the leading 1 + if ord(s[ps]) & 0x80: + cp = rutf8.codepoint_at_pos(s, ps) + hexa = hex(cp + 0x10000000) + lis.append('\\U0') + lis.append(hexa[3:]) # Skip 0x and the leading 1 + ps = rutf8.next_codepoint_pos(s, ps) else: lis.append(s[ps]) ps += 1 @@ -250,20 +250,29 @@ ch >= 'A' and ch <= 'F') -def decode_utf8(space, s, ps, end): +def check_utf8(space, s, ps, end): assert ps >= 0 pt = ps # while (s < end && *s != '\\') s++; */ /* inefficient for u".." while ps < end and ord(s[ps]) & 0x80: ps += 1 - u = unicodehelper.decode_utf8(space, s[pt:ps]) - return u, ps + try: + rutf8.check_utf8(s, True, pt, ps) + except rutf8.CheckError as e: + lgt, flag = rutf8.check_utf8(s, True, pt, e.pos) + unicodehelper.decode_error_handler(space)('strict', 'utf8', + 'invalid utf-8', s, pt + lgt, pt + lgt + 1) + return s[pt:ps] def decode_utf8_recode(space, s, ps, end, recode_encoding): - u, ps = decode_utf8(space, s, ps, end) - w_v = unicodehelper.encode(space, space.newunicode(u), recode_encoding) + p = ps + while p < end and ord(s[p]) & 0x80: + p += 1 + lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p) + w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt), + recode_encoding) v = space.bytes_w(w_v) - return v, ps + return v, p def raise_app_valueerror(space, msg): raise OperationError(space.w_ValueError, space.newtext(msg)) diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py --- a/pypy/interpreter/pyparser/pytokenizer.py +++ b/pypy/interpreter/pyparser/pytokenizer.py @@ -6,6 +6,7 @@ from pypy.interpreter.pyparser.pytokenize import tabsize, alttabsize, whiteSpaceDFA, \ triple_quoted, endDFAs, single_quoted, pseudoDFA from pypy.interpreter.astcompiler import consts +from rpython.rlib import rutf8 NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_' NUMCHARS = '0123456789' @@ -46,14 +47,9 @@ def verify_utf8(token): - for c in token: - if ord(c) >= 0x80: - break - else: - return True try: - u = token.decode('utf-8') - except UnicodeDecodeError: + rutf8.check_utf8(token, False) + except rutf8.CheckError: return False return True @@ -69,17 +65,12 @@ def verify_identifier(token): # 1=ok; 0=not an identifier; -1=bad utf-8 - for c in token: - if ord(c) >= 0x80: - break - else: - return 1 try: - u = token.decode('utf-8') - except UnicodeDecodeError: + rutf8.check_utf8(token, False) + except rutf8.CheckError: return -1 from pypy.objspace.std.unicodeobject import _isidentifier - return _isidentifier(u) + return _isidentifier(token) DUMMY_DFA = automata.DFA([], []) diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py --- a/pypy/interpreter/pyparser/test/test_parsestring.py +++ b/pypy/interpreter/pyparser/test/test_parsestring.py @@ -10,7 +10,7 @@ assert space.bytes_w(w_ret) == value elif isinstance(value, unicode): assert space.type(w_ret) == space.w_unicode - assert space.unicode_w(w_ret) == value + assert space.utf8_w(w_ret).decode('utf8') == value else: assert False @@ -61,7 +61,7 @@ s = "u'\x81'" s = s.decode("koi8-u").encode("utf8")[1:] w_ret = parsestring.parsestr(self.space, 'koi8-u', s) - ret = space.unwrap(w_ret) + ret = w_ret._utf8.decode('utf8') assert ret == eval("# -*- coding: koi8-u -*-\nu'\x81'") def test_unicode_pep414(self): @@ -112,14 +112,14 @@ space = self.space s = '"""' + '\\' + '\n"""' w_ret = parsestring.parsestr(space, None, s) - assert space.str_w(w_ret) == '' + assert space.text_w(w_ret) == '' def test_bug1(self): space = self.space expected = ['x', ' ', chr(0xc3), chr(0xa9), ' ', '\n'] input = ["'", 'x', ' ', chr(0xc3), chr(0xa9), ' ', chr(92), 'n', "'"] w_ret = parsestring.parsestr(space, 'utf8', ''.join(input)) - assert space.str_w(w_ret) == ''.join(expected) + assert space.text_w(w_ret) == ''.join(expected) def test_wide_unicode_in_source(self): if sys.maxunicode == 65535: @@ -131,7 +131,4 @@ def test_decode_unicode_utf8(self): buf = parsestring.decode_unicode_utf8(self.space, 'u"\xf0\x9f\x92\x8b"', 2, 6) - if sys.maxunicode == 65535: - assert buf == r"\U0000d83d\U0000dc8b" - else: - assert buf == r"\U0001f48b" + assert buf == r"\U0001f48b" diff --git a/pypy/interpreter/test/test_appinterp.py b/pypy/interpreter/test/test_appinterp.py --- a/pypy/interpreter/test/test_appinterp.py +++ b/pypy/interpreter/test/test_appinterp.py @@ -155,7 +155,7 @@ w_mymod2 = MyModule(space2, space2.wrap('mymod')) w_str = space1.getattr(w_mymod1, space1.wrap("hi")) - assert space1.str_w(w_str) == "hello" + assert space1.text_w(w_str) == "hello" class TestMixedModuleUnfreeze: spaceconfig = dict(usemodules=('_socket',)) diff --git a/pypy/interpreter/test/test_argument.py b/pypy/interpreter/test/test_argument.py --- a/pypy/interpreter/test/test_argument.py +++ b/pypy/interpreter/test/test_argument.py @@ -55,6 +55,9 @@ pass class DummySpace(object): + class sys: + defaultencoding = 'utf-8' + def newtuple(self, items): return tuple(items) @@ -92,16 +95,15 @@ def getitem(self, obj, key): return obj[key] - def wrap(self, obj): + def wrap(self, obj, lgt=-1): return obj newtext = wrap - newunicode = wrap def text_w(self, s): - return self.unicode_w(s).encode('utf-8') + return self.utf8_w(s) - def unicode_w(self, s): - return unicode(s) + def utf8_w(self, s): + return s def len(self, x): return len(x) @@ -135,7 +137,7 @@ def type(self, obj): class Type: def getname(self, space): - return unicode(type(obj).__name__) + return type(obj).__name__ return Type() @@ -343,14 +345,14 @@ def test_unwrap_error(self): space = DummySpace() valuedummy = object() - def unicode_w(w): + def utf8_w(w): if w is None: raise OperationError(TypeError, None) if w is valuedummy: raise OperationError(ValueError, None) - return str(w) - space.unicode_w = unicode_w - space.text_w = unicode_w + return bytes(w, 'utf-8') + space.utf8_w = utf8_w + space.text_w = utf8_w excinfo = py.test.raises(OperationError, Arguments, space, [], ["a"], [1], w_starstararg={None: 1}) assert excinfo.value.w_type is TypeError @@ -672,14 +674,14 @@ try: Arguments(space, [], w_stararg=space.wrap(42)) except OperationError as e: - msg = space.str_w(space.str(e.get_w_value(space))) + msg = space.text_w(space.str(e.get_w_value(space))) assert msg == "argument after * must be an iterable, not int" else: assert 0, "did not raise" try: Arguments(space, [], w_starstararg=space.wrap(42)) except OperationError as e: - msg = space.str_w(space.str(e.get_w_value(space))) + msg = space.text_w(space.str(e.get_w_value(space))) assert msg == "argument after ** must be a mapping, not int" else: assert 0, "did not raise" @@ -838,7 +840,6 @@ _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit