Author: martin.v.loewis
Date: Sun Jul 29 20:10:01 2007
New Revision: 56621

Added:
   python/branches/py3k-struni/Lib/test/badsyntax_pep3120.py   (contents, props changed)
   python/branches/py3k-struni/Lib/test/test_pep3120.py
      - copied, changed from r56608, python/branches/py3k-struni/Lib/test/test_pep263.py
Modified:
   python/branches/py3k-struni/Misc/NEWS
   python/branches/py3k-struni/Parser/tokenizer.c
   python/branches/py3k-struni/Python/ast.c
Log:
Implement PEP 3120.


Added: python/branches/py3k-struni/Lib/test/badsyntax_pep3120.py
==============================================================================
--- (empty file)
+++ python/branches/py3k-struni/Lib/test/badsyntax_pep3120.py   Sun Jul 29 20:10:01 2007
@@ -0,0 +1 @@
+print("b�se")

Copied: python/branches/py3k-struni/Lib/test/test_pep3120.py (from r56608, python/branches/py3k-struni/Lib/test/test_pep263.py)
==============================================================================
Binary files. No diff available.

Modified: python/branches/py3k-struni/Misc/NEWS
==============================================================================
--- python/branches/py3k-struni/Misc/NEWS       (original)
+++ python/branches/py3k-struni/Misc/NEWS       Sun Jul 29 20:10:01 2007
@@ -26,6 +26,8 @@
 Core and Builtins
 -----------------
 
+- PEP 3120: Change default encoding to UTF-8.
+
 - PEP 3123: Use proper C inheritance for PyObject.
 
 - Removed the __oct__ and __hex__ special methods and added a bin()

Modified: python/branches/py3k-struni/Parser/tokenizer.c
==============================================================================
--- python/branches/py3k-struni/Parser/tokenizer.c      (original)
+++ python/branches/py3k-struni/Parser/tokenizer.c      Sun Jul 29 20:10:01 2007
@@ -444,6 +444,34 @@
        ungetc(c, tok->fp);
 }
 
+/* Check whether the characters at s start a valid
+   UTF-8 sequence. Return the number of characters forming
+   the sequence if yes, 0 if not.  */
+static int valid_utf8(const unsigned char* s)
+{
+       int expected = 0;
+       int length;
+       if (*s < 0x80)
+               /* single-byte code */
+               return 1;
+       if (*s < 0xc0)
+               /* following byte */
+               return 0;
+       if (*s < 0xE0)
+               expected = 1;
+       else if (*s < 0xF0)
+               expected = 2;
+       else if (*s < 0xF8)
+               expected = 3;
+       else
+               return 0;
+       length = expected + 1;
+       for (; expected; expected--)
+               if (s[expected] < 0x80 || s[expected] >= 0xC0)
+                       return 0;
+       return length;
+}
+
 /* Read a line of input from TOK. Determine encoding
    if necessary.  */
 
@@ -478,12 +506,13 @@
                }
        }
 #ifndef PGEN
-       /* The default encoding is ASCII, so make sure we don't have any
-           non-ASCII bytes in it. */
+       /* The default encoding is UTF-8, so make sure we don't have any
+           non-UTF-8 sequences in it. */
        if (line && !tok->encoding) {
                unsigned char *c;
-               for (c = (unsigned char *)line; *c; c++)
-                       if (*c > 127) {
+               int length;
+               for (c = (unsigned char *)line; *c; c += length)
+                       if (!(length = valid_utf8(c))) {
                                badchar = *c;
                                break;
                        }
@@ -493,7 +522,7 @@
                /* Need to add 1 to the line number, since this line
                   has not been counted, yet.  */
                sprintf(buf,
-                       "Non-ASCII character '\\x%.2x' "
+                       "Non-UTF-8 code starting with '\\x%.2x' "
                        "in file %.200s on line %i, "
                        "but no encoding declared; "
                        "see http://www.python.org/peps/pep-0263.html for details",

Modified: python/branches/py3k-struni/Python/ast.c
==============================================================================
--- python/branches/py3k-struni/Python/ast.c    (original)
+++ python/branches/py3k-struni/Python/ast.c    Sun Jul 29 20:10:01 2007
@@ -203,7 +203,8 @@
         c.c_encoding = STR(n);
         n = CHILD(n, 0);
     } else {
-        c.c_encoding = NULL;
+       /* PEP 3120 */
+        c.c_encoding = "utf-8";
     }
     c.c_arena = arena;
 
_______________________________________________
Python-3000-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/python-3000-checkins

Reply via email to