Author: martin.v.loewis
Date: Wed Aug 15 09:32:56 2007
New Revision: 57049

Added:
   python/branches/py3k/Lib/test/badsyntax_3131.py   (contents, props changed)
   python/branches/py3k/Lib/test/test_pep3131.py   (contents, props changed)
Modified:
   python/branches/py3k/Doc/lib/libstdtypes.tex
   python/branches/py3k/Include/errcode.h
   python/branches/py3k/Include/unicodeobject.h
   python/branches/py3k/Lib/test/test_unicode.py
   python/branches/py3k/Misc/NEWS
   python/branches/py3k/Objects/unicodeobject.c
   python/branches/py3k/Parser/tokenizer.c
   python/branches/py3k/Python/ast.c
   python/branches/py3k/Python/pythonrun.c
Log:
Implement PEP 3131. Add isidentifier to str.


Modified: python/branches/py3k/Doc/lib/libstdtypes.tex
==============================================================================
--- python/branches/py3k/Doc/lib/libstdtypes.tex        (original)
+++ python/branches/py3k/Doc/lib/libstdtypes.tex        Wed Aug 15 09:32:56 2007
@@ -653,6 +653,11 @@
 For 8-bit strings, this method is locale-dependent.
 \end{methoddesc}
 
+\begin{methoddesc}[str]{isidentifier}{}
+Return True if S is a valid identifier according
+to the language definition.
+\end{methoddesc}
+
 \begin{methoddesc}[str]{islower}{}
 Return true if all cased characters in the string are lowercase and
 there is at least one cased character, false otherwise.

Modified: python/branches/py3k/Include/errcode.h
==============================================================================
--- python/branches/py3k/Include/errcode.h      (original)
+++ python/branches/py3k/Include/errcode.h      Wed Aug 15 09:32:56 2007
@@ -29,6 +29,7 @@
 #define E_EOFS         23      /* EOF in triple-quoted string */
 #define E_EOLS         24      /* EOL in single-quoted string */
 #define E_LINECONT     25      /* Unexpected characters after a line continuation */
+#define E_IDENTIFIER    26      /* Invalid characters in identifier */
 
 #ifdef __cplusplus
 }

Modified: python/branches/py3k/Include/unicodeobject.h
==============================================================================
--- python/branches/py3k/Include/unicodeobject.h        (original)
+++ python/branches/py3k/Include/unicodeobject.h        Wed Aug 15 09:32:56 2007
@@ -182,6 +182,7 @@
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
+# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
 # define PyUnicode_Join PyUnicodeUCS2_Join
 # define PyUnicode_Partition PyUnicodeUCS2_Partition
 # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
@@ -268,6 +269,7 @@
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
+# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
 # define PyUnicode_Join PyUnicodeUCS4_Join
 # define PyUnicode_Partition PyUnicodeUCS4_Partition
 # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
@@ -1250,6 +1252,10 @@
     PyObject *element          /* Element string */
     );
 
+/* Checks whether argument is a valid identifier. */
+
+PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
+
 /* Externally visible for str.strip(unicode) */
 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
     PyUnicodeObject *self,

Added: python/branches/py3k/Lib/test/badsyntax_3131.py
==============================================================================
--- (empty file)
+++ python/branches/py3k/Lib/test/badsyntax_3131.py     Wed Aug 15 09:32:56 2007
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+€ = 2

Added: python/branches/py3k/Lib/test/test_pep3131.py
==============================================================================
--- (empty file)
+++ python/branches/py3k/Lib/test/test_pep3131.py       Wed Aug 15 09:32:56 2007
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+import unittest
+from test import test_support
+
+class PEP3131Test(unittest.TestCase):
+
+    def test_valid(self):
+        class T:
+            ä = 1
+            µ = 2 # this is a compatibility character
+            蟒 = 3
+        self.assertEquals(getattr(T, "\xe4"), 1)
+        self.assertEquals(getattr(T, "\u03bc"), 2)
+        self.assertEquals(getattr(T, '\u87d2'), 3)
+
+    def test_invalid(self):
+        try:
+            from test import badsyntax_3131
+        except SyntaxError as s:
+            self.assertEquals(str(s),
+              "invalid character in identifier (badsyntax_3131.py, line 2)")
+        else:
+            self.fail("expected exception didn't occur")
+
+def test_main():
+    test_support.run_unittest(PEP3131Test)
+
+if __name__=="__main__":
+    test_main()

Modified: python/branches/py3k/Lib/test/test_unicode.py
==============================================================================
--- python/branches/py3k/Lib/test/test_unicode.py       (original)
+++ python/branches/py3k/Lib/test/test_unicode.py       Wed Aug 15 09:32:56 2007
@@ -313,6 +313,19 @@
 
         self.assertRaises(TypeError, "abc".isnumeric, 42)
 
+    def test_isidentifier(self):
+        self.assertTrue("a".isidentifier())
+        self.assertTrue("Z".isidentifier())
+        self.assertTrue("_".isidentifier())
+        self.assertTrue("b0".isidentifier())
+        self.assertTrue("bc".isidentifier())
+        self.assertTrue("b_".isidentifier())
+        self.assertTrue("µ".isidentifier())
+
+        self.assertFalse(" ".isidentifier())
+        self.assertFalse("[".isidentifier())
+        self.assertFalse("©".isidentifier())
+
     def test_contains(self):
         # Testing Unicode contains method
         self.assert_('a' in 'abdb')

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS      (original)
+++ python/branches/py3k/Misc/NEWS      Wed Aug 15 09:32:56 2007
@@ -26,6 +26,8 @@
 Core and Builtins
 -----------------
 
+- PEP 3131: Support non-ASCII identifiers.
+
 - PEP 3120: Change default encoding to UTF-8.
 
 - PEP 3123: Use proper C inheritance for PyObject.

Modified: python/branches/py3k/Objects/unicodeobject.c
==============================================================================
--- python/branches/py3k/Objects/unicodeobject.c        (original)
+++ python/branches/py3k/Objects/unicodeobject.c        Wed Aug 15 09:32:56 2007
@@ -227,7 +227,8 @@
 }
 
 /* We allocate one more byte to make sure the string is
-   Ux0000 terminated -- XXX is this needed ?
+   Ux0000 terminated; some code (e.g. new_identifier)
+   relies on that.
 
    XXX This allocator could further be enhanced by assuring that the
        free list never reduces its size below 1.
@@ -6679,6 +6680,47 @@
     return PyBool_FromLong(1);
 }
 
+int
+PyUnicode_IsIdentifier(PyObject *self)
+{
+    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
+    register const Py_UNICODE *e;
+
+    /* Special case for empty strings */
+    if (PyUnicode_GET_SIZE(self) == 0)
+       return 0;
+
+    /* PEP 3131 says that the first character must be in
+       XID_Start and subsequent characters in XID_Continue,
+       and for the ASCII range, the 2.x rules apply (i.e.
+       start with letters and underscore, continue with 
+       letters, digits, underscore). However, given the current
+       definition of XID_Start and XID_Continue, it is sufficient
+       to check just for these, except that _ must be allowed
+       as starting an identifier.  */
+    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
+        return 0;
+
+    e = p + PyUnicode_GET_SIZE(self);
+    for (p++; p < e; p++) {
+       if (!_PyUnicode_IsXidContinue(*p))
+           return 0;
+    }
+    return 1;
+}
+
+PyDoc_STRVAR(isidentifier__doc__,
+"S.isidentifier() -> bool\n\
+\n\
+Return True if S is a valid identifier according\n\
+to the language definition.");
+
+static PyObject*
+unicode_isidentifier(PyObject *self)
+{
+    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
+}
+
 PyDoc_STRVAR(join__doc__,
 "S.join(sequence) -> unicode\n\
 \n\
@@ -7714,6 +7756,7 @@
     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
+    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
 #if 0
     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},

Modified: python/branches/py3k/Parser/tokenizer.c
==============================================================================
--- python/branches/py3k/Parser/tokenizer.c     (original)
+++ python/branches/py3k/Parser/tokenizer.c     Wed Aug 15 09:32:56 2007
@@ -21,13 +21,15 @@
 #define is_potential_identifier_start(c) (\
                           (c >= 'a' && c <= 'z')\
                       || (c >= 'A' && c <= 'Z')\
-                      || c == '_')
+                      || c == '_'\
+                      || (c >= 128))
 
 #define is_potential_identifier_char(c) (\
                           (c >= 'a' && c <= 'z')\
                       || (c >= 'A' && c <= 'Z')\
                       || (c >= '0' && c <= '9')\
-                      || c == '_')
+                      || c == '_'\
+                      || (c >= 128))
 
 extern char *PyOS_Readline(FILE *, FILE *, char *);
 /* Return malloc'ed string including trailing \n;
@@ -1070,6 +1072,19 @@
        return 0;
 }
 
+#ifdef PGEN
+#define verify_identifier(s,e) 1
+#else
+/* Verify that the identifier follows PEP 3131. */
+static int
+verify_identifier(char *start, char *end)
+{
+       PyObject *s = PyUnicode_DecodeUTF8(start, end-start, NULL);
+       int result = PyUnicode_IsIdentifier(s);
+       Py_DECREF(s);
+       return result;
+}
+#endif
 
 /* Get next token, after space stripping etc. */
 
@@ -1077,7 +1092,7 @@
 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
 {
        register int c;
-       int blankline;
+       int blankline, nonascii;
 
        *p_start = *p_end = NULL;
   nextline:
@@ -1195,6 +1210,7 @@
        }
 
        /* Identifier (most frequent token!) */
+       nonascii = 0;
        if (is_potential_identifier_start(c)) {
                /* Process r"", u"" and ur"" */
                switch (c) {
@@ -1214,9 +1230,16 @@
                        break;
                }
                while (is_potential_identifier_char(c)) {
+                       if (c >= 128)
+                               nonascii = 1;
                        c = tok_nextc(tok);
                }
                tok_backup(tok, c);
+               if (nonascii && 
+                   !verify_identifier(tok->start, tok->cur)) {
+                       tok->done = E_IDENTIFIER;
+                       return ERRORTOKEN;
+               }
                *p_start = tok->start;
                *p_end = tok->cur;
                return NAME;

Modified: python/branches/py3k/Python/ast.c
==============================================================================
--- python/branches/py3k/Python/ast.c   (original)
+++ python/branches/py3k/Python/ast.c   Wed Aug 15 09:32:56 2007
@@ -47,8 +47,27 @@
 #define COMP_SETCOMP  2
 
 static identifier
-new_identifier(const char* n, PyArena *arena) {
+new_identifier(const char* n, PyArena *arena)
+{
     PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
+    Py_UNICODE *u = PyUnicode_AS_UNICODE(id);
+    /* Check whether there are non-ASCII characters in the
+       identifier; if so, normalize to NFKC. */
+    for (; *u; u++) {
+       if (*u >= 128) {
+           PyObject *m = PyImport_ImportModule("unicodedata");
+           PyObject *id2;
+           if (!m)
+               return NULL;
+           id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id);
+           Py_DECREF(m);
+           if (!id2)
+               return NULL;
+           Py_DECREF(id);
+           id = id2;
+           break;
+       }
+    }
     PyUnicode_InternInPlace(&id);
     PyArena_AddPyObject(arena, id);
     return id;

Modified: python/branches/py3k/Python/pythonrun.c
==============================================================================
--- python/branches/py3k/Python/pythonrun.c     (original)
+++ python/branches/py3k/Python/pythonrun.c     Wed Aug 15 09:32:56 2007
@@ -1530,6 +1530,10 @@
        case E_LINECONT:
                msg = "unexpected character after line continuation character";
                break;
+
+       case E_IDENTIFIER:
+               msg = "invalid character in identifier";
+               break;
        default:
                fprintf(stderr, "error=%d\n", err->error);
                msg = "unknown parsing error";
_______________________________________________
Python-3000-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/python-3000-checkins

Reply via email to