Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95386:3a259b9aeedd
Date: 2018-11-29 08:02 -0800
http://bitbucket.org/pypy/pypy/changeset/3a259b9aeedd/
Log: test, fix surrogates in _sre encoding
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -43,7 +43,8 @@
return space.newbytes(ctx._string[start:end])
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
uni = ctx._unicodestr[start:end]
- uni_utf8 = unicode_encode_utf_8(uni, len(uni), 'strict')
+ uni_utf8 = unicode_encode_utf_8(uni, len(uni), 'strict',
+ allow_surrogates=True)
return space.newtext(uni_utf8, len(uni))
else:
# unreachable
diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -116,6 +116,8 @@
assert ['', 'a', 'l', 'a', 'lla'] == re.split("b(a)", "balballa")
assert ['', 'a', None, 'l', 'u', None, 'lla'] == (
re.split("b([ua]|(s))", "balbulla"))
+ assert ['Hello \udce2\udc9c\udc93', ''] == re.split(r'\r\n|\r|\n',
+ 'Hello \udce2\udc9c\udc93\n')
def test_weakref(self):
import re, _weakref
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit