Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r95386:3a259b9aeedd
Date: 2018-11-29 08:02 -0800
http://bitbucket.org/pypy/pypy/changeset/3a259b9aeedd/

Log:    test, fix surrogates in _sre encoding

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -43,7 +43,8 @@
             return space.newbytes(ctx._string[start:end])
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
             uni = ctx._unicodestr[start:end]
-            uni_utf8 = unicode_encode_utf_8(uni, len(uni), 'strict')
+            uni_utf8 = unicode_encode_utf_8(uni, len(uni), 'strict',
+                                            allow_surrogates=True)
             return space.newtext(uni_utf8, len(uni))
         else:
             # unreachable
diff --git a/pypy/module/_sre/test/test_app_sre.py 
b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -116,6 +116,8 @@
         assert ['', 'a', 'l', 'a', 'lla'] == re.split("b(a)", "balballa")
         assert ['', 'a', None, 'l', 'u', None, 'lla'] == (
             re.split("b([ua]|(s))", "balbulla"))
+        assert ['Hello \udce2\udc9c\udc93', ''] == re.split(r'\r\n|\r|\n',
+                    'Hello \udce2\udc9c\udc93\n')
 
     def test_weakref(self):
         import re, _weakref
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to