Author: Armin Rigo <[email protected]>
Branch: py3.5
Changeset: r88901:77d31587155e
Date: 2016-12-06 13:34 +0100
http://bitbucket.org/pypy/pypy/changeset/77d31587155e/
Log: Some more checks that comments contain valid utf-8
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -44,9 +44,21 @@
return None
+def verify_utf8(token):
+ for c in token:
+ if ord(c) >= 0x80:
+ break
+ else:
+ return True
+ try:
+ u = token.decode('utf-8')
+ except UnicodeDecodeError:
+ return False
+ return True
+
def verify_identifier(token):
for c in token:
- if ord(c) > 0x80:
+ if ord(c) >= 0x80:
break
else:
return True
@@ -159,8 +171,14 @@
pos = pos + 1
if pos == max: break
- if line[pos] in '#\r\n':
- # skip comments or blank lines
+ if line[pos] in '\r\n':
+ # skip blank lines
+ continue
+ if line[pos] == '#':
+ # skip full-line comment, but still check that it is valid utf-8
+ if not verify_utf8(line):
+ raise TokenError("Non-UTF-8 code in comment",
+ line, lnum, pos, token_list)
continue
if column == indents[-1]:
@@ -227,7 +245,10 @@
token_list.append(tok)
last_comment = ''
elif initial == '#':
- # skip comment
+ # skip comment, but still check that it is valid utf-8
+ if not verify_utf8(token):
+ raise TokenError("Non-UTF-8 code in comment",
+ line, lnum, start, token_list)
last_comment = token
elif token in triple_quoted:
endDFA = endDFAs[token]
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -954,6 +954,27 @@
else:
assert False, "Expected SyntaxError"
+ def test_invalid_utf8_in_comments_or_strings(self):
+ import sys
+ compile(b"# coding: latin1\n#\xfd\n", "dummy", "exec")
+ raises(SyntaxError, compile, b"# coding: utf-8\n'\xfd'\n",
+ "dummy", "exec") #1
+ raises(SyntaxError, compile, b'# coding: utf-8\nx=5\nb"\xfd"\n',
+ "dummy", "exec") #2
+ # the following example still fails on CPython 3.5.2, skip if -A
+ if '__pypy__' in sys.builtin_module_names:
+ raises(SyntaxError, compile, b"# coding: utf-8\n#\xfd\n",
+ "dummy", "exec") #3
+
+ def test_cpython_issues_24022_25388(self):
+ from _ast import PyCF_ACCEPT_NULL_BYTES
+ raises(SyntaxError, compile, b'0000\x00\n00000000000\n\x00\n\x9e\n',
+ "dummy", "exec", PyCF_ACCEPT_NULL_BYTES)
+ raises(SyntaxError, compile, b"#\x00\n#\xfd\n", "dummy", "exec",
+ PyCF_ACCEPT_NULL_BYTES)
+ raises(SyntaxError, compile, b"#\x00\nx=5#\xfd\n", "dummy", "exec",
+ PyCF_ACCEPT_NULL_BYTES)
+
def test_dict_and_set_literal_order(self):
x = 1
l1 = list({1:'a', 3:'b', 2:'c', 4:'d'})
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit