Author: Matti Picus <[email protected]>
Branch: py3.6
Changeset: r96067:813c99f810ac
Date: 2019-02-18 15:07 +0200
http://bitbucket.org/pypy/pypy/changeset/813c99f810ac/
Log: collect surrogate pairs for error
diff --git a/pypy/interpreter/unicodehelper.py
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -239,8 +239,19 @@
upos += rutf8.codepoints_in_utf8(s, start=pos, end=e.pos)
pos = e.pos
assert pos >= 0
+ # Try to collect surrogates in one pass
+ # XXX do we care about performance in this case?
+ # XXX should this loop for more than one pair?
+ delta = 1
+ uchr = rutf8.codepoint_at_pos(s, pos)
+ if 0xD800 <= uchr <= 0xDBFF:
+ pos = rutf8.next_codepoint_pos(s, pos)
+ if pos < size:
+ uchr = rutf8.codepoint_at_pos(s, pos)
+ if 0xDC00 <= uchr <= 0xDFFF:
+ delta += 1
res, newindex, rettype = errorhandler(errors, 'utf8',
- 'surrogates not allowed', s, upos, upos + 1)
+ 'surrogates not allowed', s, upos, upos + delta)
if rettype == 'u':
for cp in rutf8.Utf8StringIterator(res):
result.append(chr(cp))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit