[pypy-commit] pypy py3.5: Some more checks that comments contain valid utf-8

arigo Tue, 06 Dec 2016 04:36:08 -0800

Author: Armin Rigo <[email protected]>
Branch: py3.5
Changeset: r88901:77d31587155e
Date: 2016-12-06 13:34 +0100
http://bitbucket.org/pypy/pypy/changeset/77d31587155e/


Log:    Some more checks that comments contain valid utf-8

diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -44,9 +44,21 @@
     return None
 
 
+def verify_utf8(token):
+    for c in token:
+        if ord(c) >= 0x80:
+            break
+    else:
+        return True
+    try:
+        u = token.decode('utf-8')
+    except UnicodeDecodeError:
+        return False
+    return True
+
 def verify_identifier(token):
     for c in token:
-        if ord(c) > 0x80:
+        if ord(c) >= 0x80:
             break
     else:
         return True
@@ -159,8 +171,14 @@
                 pos = pos + 1
             if pos == max: break
 
-            if line[pos] in '#\r\n':
-                # skip comments or blank lines
+            if line[pos] in '\r\n':
+                # skip blank lines
+                continue
+            if line[pos] == '#':
+                # skip full-line comment, but still check that it is valid utf-8
+                if not verify_utf8(line):
+                    raise TokenError("Non-UTF-8 code in comment",
+                                     line, lnum, pos, token_list)
                 continue
 
             if column == indents[-1]:
@@ -227,7 +245,10 @@
                         token_list.append(tok)
                     last_comment = ''
                 elif initial == '#':
-                    # skip comment
+                    # skip comment, but still check that it is valid utf-8
+                    if not verify_utf8(token):
+                        raise TokenError("Non-UTF-8 code in comment",
+                                         line, lnum, start, token_list)
                     last_comment = token
                 elif token in triple_quoted:
                     endDFA = endDFAs[token]
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -954,6 +954,27 @@
         else:
             assert False, "Expected SyntaxError"
 
+    def test_invalid_utf8_in_comments_or_strings(self):
+        import sys
+        compile(b"# coding: latin1\n#\xfd\n", "dummy", "exec")
+        raises(SyntaxError, compile, b"# coding: utf-8\n'\xfd'\n",
+               "dummy", "exec") #1
+        raises(SyntaxError, compile, b'# coding: utf-8\nx=5\nb"\xfd"\n',
+               "dummy", "exec") #2
+        # the following example still fails on CPython 3.5.2, skip if -A
+        if '__pypy__' in sys.builtin_module_names:
+            raises(SyntaxError, compile, b"# coding: utf-8\n#\xfd\n",
+                   "dummy", "exec") #3
+
+    def test_cpython_issues_24022_25388(self):
+        from _ast import PyCF_ACCEPT_NULL_BYTES
+        raises(SyntaxError, compile, b'0000\x00\n00000000000\n\x00\n\x9e\n',
+               "dummy", "exec", PyCF_ACCEPT_NULL_BYTES)
+        raises(SyntaxError, compile, b"#\x00\n#\xfd\n", "dummy", "exec",
+               PyCF_ACCEPT_NULL_BYTES)
+        raises(SyntaxError, compile, b"#\x00\nx=5#\xfd\n", "dummy", "exec",
+               PyCF_ACCEPT_NULL_BYTES)
+
     def test_dict_and_set_literal_order(self):
         x = 1
         l1 = list({1:'a', 3:'b', 2:'c', 4:'d'})
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3.5: Some more checks that comments contain valid utf-8

Reply via email to