Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8
Changeset: r93385:f6e0cc1e875e
Date: 2017-12-12 16:28 +0000
http://bitbucket.org/pypy/pypy/changeset/f6e0cc1e875e/
Log: Use Utf8StringBuilder in decode_unicode_escape() and fix handling of
invalid \U escapes
diff --git a/pypy/interpreter/unicodehelper.py
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -351,56 +351,48 @@
endinpos = pos
while endinpos < len(s) and s[endinpos] in hexdigits:
endinpos += 1
- res, pos = errorhandler(errors, encoding,
- message, s, pos-2, endinpos)
- size = rutf8.check_utf8(res, True)
+ res, pos = errorhandler(
+ errors, encoding, message, s, pos - 2, endinpos)
builder.append(res)
else:
try:
- chr = r_uint(int(s[pos:pos+digits], 16))
+ chr = r_uint(int(s[pos:pos + digits], 16))
except ValueError:
endinpos = pos
while s[endinpos] in hexdigits:
endinpos += 1
- res, pos = errorhandler(errors, encoding,
- message, s, pos-2, endinpos)
- size = rutf8.check_utf8(res, True)
+ res, pos = errorhandler(
+ errors, encoding, message, s, pos - 2, endinpos)
builder.append(res)
else:
# when we get here, chr is a 32-bit unicode character
try:
- rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
+ builder.append_code(chr)
+ pos += digits
except ValueError:
message = "illegal Unicode character"
- res, pos = errorhandler(errors, encoding,
- message, s, pos-2, pos+digits)
- size = rutf8.check_utf8(res, True)
+ res, pos = errorhandler(
+ errors, encoding, message, s, pos - 2, pos + digits)
builder.append(res)
- else:
- pos += digits
- size = 1
-
- return pos, size
+ return pos
def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
size = len(s)
if size == 0:
return '', 0, 0
- builder = StringBuilder(size)
+ builder = rutf8.Utf8StringBuilder(size)
pos = 0
- outsize = 0
while pos < size:
ch = s[pos]
# Non-escape characters are interpreted as Unicode ordinals
if ch != '\\':
if ord(ch) > 0x7F:
- rutf8.unichr_as_utf8_append(builder, ord(ch))
+ builder.append_code(ord(ch))
else:
builder.append(ch)
pos += 1
- outsize += 1
continue
# - Escapes
@@ -408,88 +400,70 @@
if pos >= size:
message = "\\ at end of string"
res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, size)
- newsize = rutf8.check_utf8(res, True)
- outsize + newsize
+ message, s, pos - 1, size)
builder.append(res)
continue
ch = s[pos]
pos += 1
# \x escapes
- if ch == '\n': pass
+ if ch == '\n':
+ pass
elif ch == '\\':
- builder.append('\\')
- outsize += 1
+ builder.append_char('\\')
elif ch == '\'':
- builder.append('\'')
- outsize += 1
+ builder.append_char('\'')
elif ch == '\"':
- builder.append('\"')
- outsize += 1
- elif ch == 'b' :
- builder.append('\b')
- outsize += 1
- elif ch == 'f' :
- builder.append('\f')
- outsize += 1
- elif ch == 't' :
- builder.append('\t')
- outsize += 1
- elif ch == 'n' :
- builder.append('\n')
- outsize += 1
- elif ch == 'r' :
- builder.append('\r')
- outsize += 1
- elif ch == 'v' :
- builder.append('\v')
- outsize += 1
- elif ch == 'a' :
- builder.append('\a')
- outsize += 1
+ builder.append_char('\"')
+ elif ch == 'b':
+ builder.append_char('\b')
+ elif ch == 'f':
+ builder.append_char('\f')
+ elif ch == 't':
+ builder.append_char('\t')
+ elif ch == 'n':
+ builder.append_char('\n')
+ elif ch == 'r':
+ builder.append_char('\r')
+ elif ch == 'v':
+ builder.append_char('\v')
+ elif ch == 'a':
+ builder.append_char('\a')
elif '0' <= ch <= '7':
x = ord(ch) - ord('0')
if pos < size:
ch = s[pos]
if '0' <= ch <= '7':
pos += 1
- x = (x<<3) + ord(ch) - ord('0')
+ x = (x << 3) + ord(ch) - ord('0')
if pos < size:
ch = s[pos]
if '0' <= ch <= '7':
pos += 1
- x = (x<<3) + ord(ch) - ord('0')
- outsize += 1
+ x = (x << 3) + ord(ch) - ord('0')
if x > 0x7F:
- rutf8.unichr_as_utf8_append(builder, x)
+ builder.append_code(x)
else:
- builder.append(chr(x))
+ builder.append_char(chr(x))
# hex escapes
# \xXX
elif ch == 'x':
digits = 2
message = "truncated \\xXX escape"
- pos, newsize = hexescape(builder, s, pos, digits,
+ pos = hexescape(builder, s, pos, digits,
"unicodeescape", errorhandler, message, errors)
- outsize += newsize
-
# \uXXXX
elif ch == 'u':
digits = 4
message = "truncated \\uXXXX escape"
- pos, newsize = hexescape(builder, s, pos, digits,
+ pos = hexescape(builder, s, pos, digits,
"unicodeescape", errorhandler, message, errors)
- outsize += newsize
-
# \UXXXXXXXX
elif ch == 'U':
digits = 8
message = "truncated \\UXXXXXXXX escape"
- pos, newsize = hexescape(builder, s, pos, digits,
+ pos = hexescape(builder, s, pos, digits,
"unicodeescape", errorhandler, message, errors)
- outsize += newsize
-
# \N{name}
elif ch == 'N' and ud_handler is not None:
message = "malformed \\N character escape"
@@ -502,38 +476,29 @@
if look < size and s[look] == '}':
# found a name. look it up in the unicode database
message = "unknown Unicode character name"
- name = s[pos+1:look]
+ name = s[pos + 1:look]
code = ud_handler.call(name)
if code < 0:
- res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- newsize = rutf8.check_utf8(res, True)
- outsize += newsize
+ res, pos = errorhandler(
+ errors, "unicodeescape", message,
+ s, pos - 1, look + 1)
builder.append(res)
continue
pos = look + 1
- outsize += 1
- rutf8.unichr_as_utf8_append(builder, code,
- allow_surrogates=True)
- # xxx 'code' is probably always within range here...
+ builder.append_code(code)
else:
res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- newsize = rutf8.check_utf8(res, True)
- outsize += newsize
+ message, s, pos - 1, look + 1)
builder.append(res)
else:
res, pos = errorhandler(errors, "unicodeescape",
- message, s, pos-1, look+1)
- newsize = rutf8.check_utf8(res, True)
- outsize += newsize
+ message, s, pos - 1, look + 1)
builder.append(res)
else:
- builder.append('\\')
- builder.append(ch)
- outsize += 2
+ builder.append_char('\\')
+ builder.append_code(ord(ch))
- return builder.build(), pos, outsize
+ return builder.build(), pos, builder.get_length()
def wcharpsize2utf8(space, wcharp, size):
"""Safe version of rffi.wcharpsize2utf8.
@@ -557,14 +522,14 @@
if size == 0:
return '', 0, 0
- result = StringBuilder(size)
+ builder = rutf8.Utf8StringBuilder(size)
pos = 0
while pos < size:
ch = s[pos]
# Non-escape characters are interpreted as Unicode ordinals
if ch != '\\':
- rutf8.unichr_as_utf8_append(result, ord(ch), True)
+ builder.append_code(ord(ch))
pos += 1
continue
@@ -575,30 +540,27 @@
pos += 1
if pos == size or s[pos] != '\\':
break
- result.append('\\')
+ builder.append_char('\\')
# we have a backslash at the end of the string, stop here
if pos >= size:
- result.append('\\')
+ builder.append_char('\\')
break
- if ((pos - bs) & 1 == 0 or
- pos >= size or
- (s[pos] != 'u' and s[pos] != 'U')):
- result.append('\\')
- rutf8.unichr_as_utf8_append(result, ord(s[pos]), True)
+ if ((pos - bs) & 1 == 0 or pos >= size or
+ (s[pos] != 'u' and s[pos] != 'U')):
+ builder.append_char('\\')
+ builder.append_code(ord(s[pos]))
pos += 1
continue
digits = 4 if s[pos] == 'u' else 8
message = "truncated \\uXXXX"
pos += 1
- pos, _ = hexescape(result, s, pos, digits,
+ pos = hexescape(builder, s, pos, digits,
"rawunicodeescape", errorhandler, message, errors)
- r = result.build()
- lgt = rutf8.check_utf8(r, True)
- return r, pos, lgt
+ return builder.build(), pos, builder.get_length()
_utf8_encode_unicode_escape = rutf8.make_utf8_escape_function()
diff --git a/pypy/module/_codecs/test/test_codecs.py
b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -120,6 +120,10 @@
{0: u'\U0010FFFF', 1: u'b', 2: u'c'}) ==
(u"\U0010FFFFbc", 3))
+ def test_escape_decode(self):
+ from _codecs import unicode_escape_decode as decode
+ assert decode('\\\x80') == (u'\\\x80', 2)
+
def test_escape_decode_errors(self):
from _codecs import escape_decode as decode
raises(ValueError, decode, br"\x")
@@ -327,10 +331,8 @@
for decode in [unicode_escape_decode, raw_unicode_escape_decode]:
for c, d in ('u', 4), ('U', 4):
for i in range(d):
- raises(UnicodeDecodeError, decode,
- "\\" + c + "0"*i)
- raises(UnicodeDecodeError, decode,
- "[\\" + c + "0"*i + "]")
+ raises(UnicodeDecodeError, decode, "\\" + c + "0"*i)
+ raises(UnicodeDecodeError, decode, "[\\" + c + "0"*i + "]")
data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
assert decode(data, "ignore") == (u"[]", len(data))
assert decode(data, "replace") == (u"[\ufffd]\ufffd",
len(data))
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -706,18 +706,18 @@
@always_inline
def append_char(self, s):
# for characters, ascii
+ self._s.append(s)
self._lgt += 1
- self._s.append(s)
@try_inline
def append_code(self, code):
+ unichr_as_utf8_append(self._s, code, True)
self._lgt += 1
- unichr_as_utf8_append(self._s, code, True)
@always_inline
def append_utf8(self, utf8, length):
+ self._s.append(utf8)
self._lgt += length
- self._s.append(utf8)
@always_inline
def build(self):
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -1,4 +1,4 @@
-import py
+import pytest
import sys
from hypothesis import given, strategies, settings, example
@@ -9,7 +9,8 @@
def test_unichr_as_utf8(c, allow_surrogates):
i = ord(c)
if not allow_surrogates and 0xD800 <= i <= 0xDFFF:
- py.test.raises(ValueError, rutf8.unichr_as_utf8, i, allow_surrogates)
+ with pytest.raises(ValueError):
+ rutf8.unichr_as_utf8(i, allow_surrogates)
else:
u = rutf8.unichr_as_utf8(i, allow_surrogates)
assert u == c.encode('utf8')
@@ -191,6 +192,13 @@
s.append_code(0xD800)
assert s.get_length() == 5
+def test_utf8_string_builder_bad_code():
+ s = rutf8.Utf8StringBuilder()
+ with pytest.raises(ValueError):
+ s.append_code(0x110000)
+ assert s.build() == ''
+ assert s.get_length() == 0
+
@given(strategies.text())
def test_utf8_iterator(arg):
u = rutf8.Utf8StringIterator(arg.encode('utf8'))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit