Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r95386:3a259b9aeedd
Date: 2018-11-29 08:02 -0800
http://bitbucket.org/pypy/pypy/changeset/3a259b9aeedd/

Log: test, fix surrogates in _sre encoding

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -43,7 +43,8 @@
             return space.newbytes(ctx._string[start:end])
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
             uni = ctx._unicodestr[start:end]
-            uni_utf8 = unicode_encode_utf_8(uni, len(uni), 'strict')
+            uni_utf8 = unicode_encode_utf_8(uni, len(uni), 'strict',
+                                            allow_surrogates=True)
             return space.newtext(uni_utf8, len(uni))
         else:
             # unreachable
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -116,6 +116,8 @@
         assert ['', 'a', 'l', 'a', 'lla'] == re.split("b(a)", "balballa")
         assert ['', 'a', None, 'l', 'u', None, 'lla'] == (
             re.split("b([ua]|(s))", "balbulla"))
+        assert ['Hello \udce2\udc9c\udc93', ''] == re.split(r'\r\n|\r|\n',
+                                        'Hello \udce2\udc9c\udc93\n')
 
     def test_weakref(self):
         import re, _weakref
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit