This exposes all of libclang's token functions in the Python bindings.

I'm putting the TokenKind enumerations in a new module,
clang.enumerations. I plan to eventually move all existing
enumerations there so they are all consolidated. And, maybe one day,
we can even generate that file automatically by parsing the libclang
header files. I know Anders has code around somewhere that does
this...

---
 bindings/python/clang/cindex.py                    | 177 +++++++++++++++++++--
 bindings/python/clang/enumerations.py              |  32 ++++
 bindings/python/tests/cindex/test_token_kind.py    |  43 +++++
 bindings/python/tests/cindex/test_tokens.py        |  52 ++++++
 .../python/tests/cindex/test_translation_unit.py   |  24 ++-
 5 files changed, 312 insertions(+), 16 deletions(-)
 create mode 100644 bindings/python/clang/enumerations.py
 create mode 100644 bindings/python/tests/cindex/test_token_kind.py
 create mode 100644 bindings/python/tests/cindex/test_tokens.py
diff --git a/bindings/python/clang/cindex.py b/bindings/python/clang/cindex.py
index 329e7ae..89a9b68 100644
--- a/bindings/python/clang/cindex.py
+++ b/bindings/python/clang/cindex.py
@@ -60,16 +60,18 @@ call is efficient.
 # o cleanup ctypes wrapping, would be nice to separate the ctypes details more
 #   clearly, and hide from the external interface (i.e., help(cindex)).
 #
 # o implement additional SourceLocation, SourceRange, and File methods.
 
 from ctypes import *
 import collections
 
+import clang.enumerations
+
 def get_cindex_library():
     # FIXME: It's probably not the case that the library is actually found in
     # this location. We need a better system of identifying and loading the
     # CIndex library. It could be on path or elsewhere, or versioned, etc.
     import platform
     name = platform.system()
     if name == 'Darwin':
         return cdll.LoadLibrary('libclang.dylib')
@@ -362,16 +364,53 @@ class FixIt(object):
 
     def __init__(self, range, value):
         self.range = range
         self.value = value
 
     def __repr__(self):
         return "<FixIt range %r, value %r>" % (self.range, self.value)
 
+class TokenKind(object):
+    """Describes a specific type of a Token."""
+
+    _value_map = {} # int -> TokenKind
+
+    def __init__(self, value, name):
+        """Create a new TokenKind instance from a numeric value and a name."""
+        self.value = value
+        self.name = name
+
+    def __repr__(self):
+        return 'TokenKind.%s' % (self.name,)
+
+    @staticmethod
+    def from_value(value):
+        """Obtain a registered TokenKind instance from its value."""
+        result = TokenKind._value_map.get(value, None)
+
+        if result is None:
+            raise ValueError('Unknown TokenKind: %d' % value)
+
+        return result
+
+    @staticmethod
+    def register(value, name):
+        """Register a new TokenKind enumeration.
+
+        This should only be called at module load time by code within this
+        package. Raises ValueError if the value is already registered.
+        """
+        if value in TokenKind._value_map:
+            raise ValueError('TokenKind already registered: %d' % value)
+
+        kind = TokenKind(value, name)
+        TokenKind._value_map[value] = kind
+        setattr(TokenKind, name, kind)
+
 ### Cursor Kinds ###
 
 class CursorKind(object):
     """
     A CursorKind describes the kind of entity that a cursor points to.
     """
 
     # The unique kind objects, indexed by id.
@@ -2042,16 +2081,72 @@ class TranslationUnit(ClangObject):
                 unsaved_files_array[i].contents = value
                 unsaved_files_array[i].length = len(value)
         ptr = lib.clang_codeCompleteAt(self, path, line, column,
                 unsaved_files_array, len(unsaved_files), options)
         if ptr:
             return CodeCompletionResults(ptr)
         return None
 
+    class TokenGroup(object):
+        """Internal class to facilitate reference counting of tokens for GC.
+
+        Tokens are created in groups. And, each group is allocated in one
+        chunk, which means it needs to be disposed as a group.
+
+        This class exists so individual tokens can reference the group they
+        came from. When the last token from a group is GC'd by Python, the
+        TokenGroup it was associated with will also be GC'd. This will actually
+        free memory associated with all the tokens.
+        """
+        def __init__(self, tu, memory, count):
+            self._tu = tu
+            self._memory = memory
+            self._count = count
+
+        def __del__(self):
+            lib.clang_disposeTokens(self._tu, self._memory, self._count)
+
+    def get_tokens(self, start_location=None, end_location=None,
+            source_range=None):
+        """Obtain tokens in this translation unit.
+
+        This is a generator for Token instances. The range to tokenize is
+        given either as a single SourceRange (which takes precedence) or as
+        two SourceLocation instances. Nothing is yielded when the requested
+        range contains no tokens.
+        """
+        if source_range is not None:
+            use_range = source_range
+        else:
+            use_range = SourceRange(start=start_location, end=end_location)
+
+        tokens_memory = POINTER(Token)()
+        tokens_count = c_uint()
+
+        lib.clang_tokenize(self, use_range, byref(tokens_memory),
+                byref(tokens_count))
+
+        count = int(tokens_count.value)
+        if count < 1:
+            # tokens_memory may be NULL here; .contents would raise ValueError.
+            return
+        tokens_array = cast(tokens_memory, POINTER(Token * count)).contents
+
+        token_group = TranslationUnit.TokenGroup(self, tokens_memory,
+                tokens_count)
+
+        for i in xrange(0, count):
+            token = Token()
+            token.int_data = tokens_array[i].int_data
+            token.ptr_data = tokens_array[i].ptr_data
+            token._tu = self
+            token._group = token_group
+            yield token
+
 class File(ClangObject):
     """
     The File class represents a particular source file that is part of a
     translation unit.
     """
 
     @staticmethod
     def from_name(translation_unit, file_name):
@@ -2097,32 +2192,78 @@ class FileInclusion(object):
         self.location = loc
         self.depth = depth
 
     @property
     def is_input_file(self):
         """True if the included file is the input file."""
         return self.depth == 0
 
+class Token(Structure):
+    """Represents a single token from the preprocessor.
+
+    Tokens are effectively segments of source code. Source code is first parsed
+    into tokens before being converted into the AST and Cursors.
+
+    Tokens are obtained from parsed TranslationUnit instances. You currently
+    can't create tokens manually.
+    """
+    _fields_ = [
+        ('int_data', c_uint * 4),
+        ('ptr_data', c_void_p)
+    ]
+
+    @property
+    def spelling(self):
+        """The spelling of this token.
+
+        This is the textual representation of the token in source.
+        """
+        return lib.clang_getTokenSpelling(self._tu, self)
+
+    @property
+    def kind(self):
+        """Obtain the TokenKind of the current token."""
+        return TokenKind.from_value(lib.clang_getTokenKind(self))
+
+    @property
+    def location(self):
+        """The SourceLocation this Token occurs at."""
+        return lib.clang_getTokenLocation(self._tu, self)
+
+    @property
+    def extent(self):
+        """The SourceRange this Token occupies."""
+        return lib.clang_getTokenExtent(self._tu, self)
+
+    @property
+    def cursor(self):
+        """The Cursor this Token corresponds to."""
+        cursor = Cursor()
+
+        lib.clang_annotateTokens(self._tu, byref(self), 1, byref(cursor))
+
+        return cursor
+
 # Now comes the plumbing to hook up the C library.
 
 # Register callback types in common container.
 callbacks['translation_unit_includes'] = CFUNCTYPE(None, c_object_p,
         POINTER(SourceLocation), c_uint, py_object)
 callbacks['cursor_visit'] = CFUNCTYPE(c_int, Cursor, Cursor, py_object)
 
 def register_functions(lib):
     """Register function prototypes with a libclang library instance.
 
     This must be called as part of library instantiation so Python knows how
     to call out to the shared library.
     """
     # Functions are registered in strictly alphabetical order.
-    #lib.clang_annotateTokens.argtype = [TranslationUnit, POINTER(Token),
-    #                                    c_uint, POINTER(Cursor)]
+    lib.clang_annotateTokens.argtypes = [TranslationUnit, POINTER(Token),
+                                         c_uint, POINTER(Cursor)]
 
     lib.clang_codeCompleteAt.argtypes = [TranslationUnit, c_char_p, c_int,
             c_int, c_void_p, c_int, c_int]
     lib.clang_codeCompleteAt.restype = POINTER(CCRStructure)
 
     lib.clang_codeCompleteGetDiagnostic.argtypes = [CodeCompletionResults,
             c_int]
     lib.clang_codeCompleteGetDiagnostic.restype = Diagnostic
@@ -2150,17 +2291,17 @@ def register_functions(lib):
     #lib.clang_disposeCXTUResourceUsage.argtypes = [CXTUResourceUsage]
 
     lib.clang_disposeDiagnostic.argtypes = [Diagnostic]
 
     lib.clang_disposeIndex.argtypes = [Index]
 
     lib.clang_disposeString.argtypes = [_CXString]
 
-    #lib.clang_disposeTokens.argtype = [TranslationUnit, POINTER(Token), c_uint]
+    lib.clang_disposeTokens.argtypes = [TranslationUnit, POINTER(Token), c_uint]
 
     lib.clang_disposeTranslationUnit.argtypes = [TranslationUnit]
 
     lib.clang_equalCursors.argtypes = [Cursor, Cursor]
     lib.clang_equalCursors.restype = bool
 
     lib.clang_equalLocations.argtypes = [SourceLocation, SourceLocation]
     lib.clang_equalLocations.restype = bool
@@ -2385,29 +2526,28 @@ def register_functions(lib):
 
     lib.clang_getSpecializedCursorTemplate.argtypes = [Cursor]
     lib.clang_getSpecializedCursorTemplate.restype = Cursor
     lib.clang_getSpecializedCursorTemplate.errcheck = Cursor.from_cursor_result
 
     lib.clang_getTemplateCursorKind.argtypes = [Cursor]
     lib.clang_getTemplateCursorKind.restype = c_uint
 
-    #lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token]
-    #lib.clang_getTokenExtent.restype = SourceRange
+    lib.clang_getTokenExtent.argtypes = [TranslationUnit, Token]
+    lib.clang_getTokenExtent.restype = SourceRange
 
-    #lib.clang_getTokenKind.argtypes = [Token]
-    #lib.clang_getTokenKind.restype = c_uint
-    #lib.clang_getTokenKind.errcheck = TokenKind.from_result
+    lib.clang_getTokenKind.argtypes = [Token]
+    lib.clang_getTokenKind.restype = c_uint
 
-    #lib.clang_getTokenLocation.argtype = [TranslationUnit, Token]
-    #lib.clang_getTokenLocation.restype = SourceLocation
+    lib.clang_getTokenLocation.argtypes = [TranslationUnit, Token]
+    lib.clang_getTokenLocation.restype = SourceLocation
 
-    #lib.clang_getTokenSpelling.argtype = [TranslationUnit, Token]
-    #lib.clang_getTokenSpelling.restype = _CXString
-    #lib.clang_getTokenSpelling.errcheck = _CXString.from_result
+    lib.clang_getTokenSpelling.argtypes = [TranslationUnit, Token]
+    lib.clang_getTokenSpelling.restype = _CXString
+    lib.clang_getTokenSpelling.errcheck = _CXString.from_result
 
     lib.clang_getTranslationUnitCursor.argtypes = [TranslationUnit]
     lib.clang_getTranslationUnitCursor.restype = Cursor
     lib.clang_getTranslationUnitCursor.errcheck = Cursor.from_result
 
     lib.clang_getTranslationUnitSpelling.argtypes = [TranslationUnit]
     lib.clang_getTranslationUnitSpelling.restype = _CXString
     lib.clang_getTranslationUnitSpelling.errcheck = _CXString.from_result
@@ -2488,33 +2628,40 @@ def register_functions(lib):
     lib.clang_reparseTranslationUnit.argtypes = [TranslationUnit, c_int,
             c_void_p, c_int]
     lib.clang_reparseTranslationUnit.restype = c_int
 
     lib.clang_saveTranslationUnit.argtypes = [TranslationUnit, c_char_p,
             c_uint]
     lib.clang_saveTranslationUnit.restype = c_int
 
-    #lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange,
-    #        POINTER(POINTER(Token)), POINTER(c_uint)]
+    lib.clang_tokenize.argtypes = [TranslationUnit, SourceRange,
+            POINTER(POINTER(Token)), POINTER(c_uint)]
 
     lib.clang_visitChildren.argtypes = [Cursor, callbacks['cursor_visit'],
             py_object]
     lib.clang_visitChildren.restype = c_uint
 
 register_functions(lib)
 
+def register_enumerations():
+    for name, value in clang.enumerations.TokenKinds:
+        TokenKind.register(value, name)
+
+register_enumerations()
 
 __all__ = [
     'CodeCompletionResults',
     'CursorKind',
     'Cursor',
     'Diagnostic',
     'File',
     'FixIt',
     'Index',
     'SourceLocation',
     'SourceRange',
+    'TokenKind',
+    'Token',
     'TranslationUnitLoadError',
     'TranslationUnit',
     'TypeKind',
     'Type',
 ]
diff --git a/bindings/python/clang/enumerations.py b/bindings/python/clang/enumerations.py
new file mode 100644
index 0000000..12e82ed
--- /dev/null
+++ b/bindings/python/clang/enumerations.py
@@ -0,0 +1,32 @@
+#===- enumerations.py - Python Enumerations ------------------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+"""
+Clang Enumerations
+==================
+
+This module provides static definitions of enumerations that exist in libclang.
+
+Enumerations are typically defined as a list of tuples. The exported values are
+typically munged into other types or classes at module load time.
+
+All enumerations are centrally defined in this file so they are all grouped
+together and easier to audit. And, maybe even one day this file will be
+automatically generated by scanning the libclang headers!
+"""
+
+TokenKinds = [
+    ('PUNCTUATION', 0),
+    ('KEYWORD', 1),
+    ('IDENTIFIER', 2),
+    ('LITERAL', 3),
+    ('COMMENT', 4),
+]
+
+__all__ = ['TokenKinds']
diff --git a/bindings/python/tests/cindex/test_token_kind.py b/bindings/python/tests/cindex/test_token_kind.py
new file mode 100644
index 0000000..62ec63e
--- /dev/null
+++ b/bindings/python/tests/cindex/test_token_kind.py
@@ -0,0 +1,43 @@
+from clang.cindex import TokenKind
+from nose.tools import eq_
+from nose.tools import ok_
+from nose.tools import raises
+
+def test_constructor():
+    """Ensure TokenKind constructor works as expected."""
+
+    t = TokenKind(5, 'foo')
+
+    eq_(t.value, 5)
+    eq_(t.name, 'foo')
+
+@raises(ValueError)
+def test_bad_register():
+    """Ensure a duplicate value is rejected for registration."""
+
+    TokenKind.register(2, 'foo')
+
+@raises(ValueError)
+def test_unknown_value():
+    """Ensure trying to fetch an unknown value raises."""
+
+    TokenKind.from_value(-1)
+
+def test_registration():
+    """Ensure that items registered appear as class attributes."""
+    ok_(hasattr(TokenKind, 'LITERAL'))
+    literal = TokenKind.LITERAL
+
+    ok_(isinstance(literal, TokenKind))
+
+def test_from_value():
+    """Ensure registered values can be obtained from from_value()."""
+    t = TokenKind.from_value(3)
+    ok_(isinstance(t, TokenKind))
+    eq_(t, TokenKind.LITERAL)
+
+def test_repr():
+    """Ensure repr() works."""
+
+    r = repr(TokenKind.LITERAL)
+    eq_(r, 'TokenKind.LITERAL')
diff --git a/bindings/python/tests/cindex/test_tokens.py b/bindings/python/tests/cindex/test_tokens.py
new file mode 100644
index 0000000..18448f4
--- /dev/null
+++ b/bindings/python/tests/cindex/test_tokens.py
@@ -0,0 +1,52 @@
+from clang.cindex import CursorKind
+from clang.cindex import Index
+from clang.cindex import SourceLocation
+from clang.cindex import SourceRange
+from clang.cindex import TokenKind
+from nose.tools import eq_
+from nose.tools import ok_
+
+from .util import get_tu
+
+def test_token_to_cursor():
+    """Ensure we can obtain a Cursor from a Token instance."""
+    tu = get_tu('int i = 5;')
+    r = tu.get_source_range('t.c', start_offset=0, end_offset=9)
+    tokens = list(tu.get_tokens(source_range=r))
+
+    assert len(tokens) == 5
+    assert tokens[1].spelling == 'i'
+    assert tokens[1].kind == TokenKind.IDENTIFIER
+
+    cursor = tokens[1].cursor
+    assert cursor.kind == CursorKind.VAR_DECL
+    assert tokens[1].cursor == tokens[2].cursor
+
+def test_token_location():
+    """Ensure Token.location works."""
+
+    tu = get_tu('int foo = 10;')
+    r = tu.get_source_range('t.c', start_offset=0, end_offset=11)
+
+    tokens = list(tu.get_tokens(source_range=r))
+    eq_(len(tokens), 4)
+
+    loc = tokens[1].location
+    ok_(isinstance(loc, SourceLocation))
+    eq_(loc.line, 1)
+    eq_(loc.column, 5)
+    eq_(loc.offset, 4)
+
+def test_token_extent():
+    """Ensure Token.extent works."""
+    tu = get_tu('int foo = 10;')
+    r = tu.get_source_range('t.c', start_offset=0, end_offset=11)
+
+    tokens = list(tu.get_tokens(source_range=r))
+    eq_(len(tokens), 4)
+
+    extent = tokens[1].extent
+    ok_(isinstance(extent, SourceRange))
+
+    eq_(extent.start.offset, 4)
+    eq_(extent.end.offset, 7)
diff --git a/bindings/python/tests/cindex/test_translation_unit.py b/bindings/python/tests/cindex/test_translation_unit.py
index c883f39..0cbe397 100644
--- a/bindings/python/tests/cindex/test_translation_unit.py
+++ b/bindings/python/tests/cindex/test_translation_unit.py
@@ -1,19 +1,21 @@
+import gc
+import os
+
 from clang.cindex import CursorKind
 from clang.cindex import Cursor
 from clang.cindex import File
 from clang.cindex import Index
 from clang.cindex import SourceLocation
 from clang.cindex import SourceRange
 from clang.cindex import TranslationUnitSaveError
 from clang.cindex import TranslationUnit
 from .util import get_cursor
 from .util import get_tu
-import os
 
 kInputsDir = os.path.join(os.path.dirname(__file__), 'INPUTS')
 
 def test_spelling():
     path = os.path.join(kInputsDir, 'hello.cpp')
     tu = TranslationUnit.from_source(path)
     assert tu.spelling == path
 
@@ -212,8 +214,28 @@ def test_get_source_range():
     end = tu.get_source_location('t.c', offset=5)
 
     r = tu.get_source_range('t.c', start_location=start, end_location=end)
     assert isinstance(r, SourceRange)
     assert r.start.offset == 0
     assert r.end.offset == 5
     assert r.start.file.name == 't.c'
     assert r.end.file.name == 't.c'
+
+def test_get_tokens_gc():
+    """Ensures get_tokens() works properly with garbage collection."""
+
+    tu = get_tu('int foo();')
+    r = tu.get_source_range('t.c', start_offset=0, end_offset=10)
+    tokens = list(tu.get_tokens(source_range=r))
+
+    assert tokens[0].spelling == 'int'
+    gc.collect()
+    assert tokens[0].spelling == 'int'
+
+    del tokens[1]
+    gc.collect()
+    assert tokens[0].spelling == 'int'
+
+    # May trigger segfault if we don't do our job properly.
+    del tokens
+    gc.collect()
+    gc.collect() # Just in case.
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

Reply via email to