[Lldb-commits] [lldb] [lldb] Add tree-sitter based Swift syntax highlighting (PR #181297)

Jonas Devlieghere via lldb-commits Mon, 16 Feb 2026 14:39:57 -0800

================
@@ -0,0 +1,1594 @@
+"use strict";
+/*
+ * MIT License
+ *
+ * Copyright (c) 2021 alex-pinkus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+const PRECS = { // Named numeric precedence levels for grammar rules, listed from largest (11) to smallest (-4).
+  multiplication: 11, // NOTE(review): tree-sitter's prec() is documented to prefer higher numbers — confirm against the tree-sitter docs.
+  addition: 10,
+  infix_operations: 9,
+  nil_coalescing: 8,
+  check: 7, // same value as prefix_operations
+  prefix_operations: 7,
+  comparison: 6, // same value as postfix_operations
+  postfix_operations: 6,
+  equality: 5,
+  conjunction: 4,
+  disjunction: 3,
+  block: 2,
+  loop: 1, // loop, keypath and parameter_pack all share level 1
+  keypath: 1,
+  parameter_pack: 1,
+  control_transfer: 0,
+  as: -1, // every entry from `as` through `ty` shares level -1
+  tuple: -1,
+  if: -1,
+  switch: -1,
+  do: -1,
+  fully_open_range: -1,
+  range: -1,
+  navigation: -1,
+  expr: -1,
+  ty: -1,
+  call: -2, // every entry from `call` through `await` shares level -2
+  ternary: -2,
+  try: -2,
+  call_suffix: -2,
+  range_suffix: -2,
+  ternary_binary_suffix: -2,
+  await: -2,
+  assignment: -3, // assignment, comment and lambda share level -3
+  comment: -3, // passed to prec() inside the `comment` token rule
+  lambda: -3,
+  regex: -4, // smallest value in the table
+};
+
+const DYNAMIC_PRECS = { // Table kept separate from PRECS; its use is outside this excerpt — presumably for prec.dynamic(), TODO confirm.
+  call : 1,
+};
+
+const DEC_DIGITS = token(sep1(/[0-9]+/, /_+/)); // runs of 0-9 with `_+` as the separator; sep1 is defined outside this excerpt (presumably "one or more, separated") — TODO confirm
+const HEX_DIGITS = token(sep1(/[0-9a-fA-F]+/, /_+/)); // same shape, hexadecimal digits in either case
+const OCT_DIGITS = token(sep1(/[0-7]+/, /_+/)); // same shape, octal digits
+const BIN_DIGITS = token(sep1(/[01]+/, /_+/)); // same shape, binary digits
+const REAL_EXPONENT = token(seq(/[eE]/, optional(/[+-]/), DEC_DIGITS)); // `e` or `E`, an optional sign, then decimal digits
+const HEX_REAL_EXPONENT = token(seq(/[pP]/, optional(/[+-]/), DEC_DIGITS)); // `p` or `P`, an optional sign, then decimal (not hex) digits
+
+var LEXICAL_IDENTIFIER; // Identifier regex used by the simple_identifier rule; assigned exactly once by the branch below.
+
+if (tree_sitter_version_supports_emoji()) { // helper defined outside this excerpt; this branch's regex additionally uses \p{Emoji}, \p{EMod} and the FE0F / 20E3 / 200D code points
+  LEXICAL_IDENTIFIER =
+      
/[_\p{XID_Start}\p{Emoji}&&[^0-9#*]](\p{EMod}|\x{FE0F}\x{20E3}?)?([_\p{XID_Continue}\p{Emoji}\x{200D}](\p{EMod}|\x{FE0F}\x{20E3}?)?)*/;
+} else { // fallback regex without any emoji classes
+  LEXICAL_IDENTIFIER = /[_\p{XID_Start}][_\p{XID_Continue}]*/; // `_` or XID_Start first, then any number of `_` or XID_Continue
+}
+
+module.exports = grammar({
+  name : "swift",
+  conflicts : ($) => [
+                // @Type(... could either be an annotation constructor
+                // invocation or an annotated expression
+                [ $.attribute ],
+                [ $._attribute_argument ],
+                // Is `foo { ... }` a constructor invocation or function
+                // invocation?
+                [ $._simple_user_type, $._expression ],
+                // To support nested types A.B not being interpreted as
+                // `(navigation_expression ... (type_identifier))
+                // (navigation_suffix)`
+                [ $.user_type ],
+                // How to tell the difference between Foo.bar(with:and:), and
+                // Foo.bar(with: smth, and: other)? You need GLR
+                [ $.value_argument ],
+                // { (foo, bar) ...
+                [ $._expression, $.lambda_parameter ],
+                [ $._primary_expression, $.lambda_parameter ],
+                // (start: start, end: end)
+                [ $._tuple_type_item_identifier, $.tuple_expression ],
+                // After a `{` in a function or switch context, it's ambiguous
+                // whether we're starting a set of local statements or applying
+                // some modifiers to a capture or pattern.
+                [ $.modifiers ],
+                // `+(...)` is ambiguously either "call the function produced by
+                // a reference to the operator `+`" or "use the unary operator
+                // `+` on the result of the parenthetical expression."
+                [ $._additive_operator, $._prefix_unary_operator ],
+                [ $._referenceable_operator, $._prefix_unary_operator ],
+                // `{ [self, b, c] ...` could be a capture list or an array
+                // literal depending on what else happens.
+                [ $.capture_list_item, $._expression ],
+                [ $.capture_list_item, $._expression, $._simple_user_type ],
+                [ $._primary_expression, $.capture_list_item ],
+                // a ? b : c () could be calling c(), or it could be calling a
+                // function that's produced by the result of
+                // `(a ? b : c)`. We have a small hack to force it to be the
+                // former of these by intentionally introducing a conflict.
+                [ $.call_suffix, $.expr_hack_at_ternary_binary_call_suffix ],
+                // try {expression} is a bit magic and applies quite broadly:
+                // `try foo()` and `try foo { }` show that this is right
+                // associative, and `try foo ? bar() : baz` even more so. But it
+                // doesn't always win: something like `if try foo { } ...`
+                // should award its braces to the `if`. In order to make this
+                // actually happen, we need to parse all the options and pick
+                // the best one that doesn't error out.
+                [ $.try_expression, $._unary_expression ],
+                [ $.try_expression, $._expression ],
+                // await {expression} has the same special cases as `try`.
+                [ $.await_expression, $._unary_expression ],
+                [ $.await_expression, $._expression ],
+                // In a computed property, when you see an @attribute, it's not
+                // yet clear if that's going to be for a locally-declared class
+                // or a getter / setter specifier.
+                [
+                  $._local_property_declaration,
+                  $._local_typealias_declaration,
+                  $._local_function_declaration,
+                  $._local_class_declaration,
+                  $.computed_getter,
+                  $.computed_modify,
+                  $.computed_setter,
+                ],
+                // The `class` modifier is legal in many of the same positions
+                // that a class declaration itself would be.
+                [ $._bodyless_function_declaration, $.property_modifier ],
+                [ $.init_declaration, $.property_modifier ],
+                // Patterns, man
+                [ $._navigable_type_expression, $._case_pattern ],
+                [
+                  $._no_expr_pattern_already_bound, $._binding_pattern_no_expr
+                ],
+
+                // On encountering a closure starting with `{ @Foo ...`, we
+                // don't yet know if that attribute applies to the closure type
+                // or to a declaration within the closure. What a mess! We just
+                // have to hope that if we keep going, only one of those will
+                // parse (because there will be an `in` or a `let`).
+                [
+                  $._lambda_type_declaration,
+                  $._local_property_declaration,
+                  $._local_typealias_declaration,
+                  $._local_function_declaration,
+                  $._local_class_declaration,
+                ],
+
+                // We want `foo() { }` to be treated as one function call, but
+                // we _also_ want `if foo() { ... }` to be treated as a full
+                // if-statement. This means we have to treat it as a conflict
+                // rather than purely a left or right associative construct, and
+                // let the parser realize that the second expression won't parse
+                // properly with the `{ ... }` as a lambda.
+                [ $.constructor_suffix ],
+                [ $.call_suffix ],
+
+                // `actor` is allowed to be an identifier, even though it is
+                // also a locally permitted declaration. If we encounter it, the
+                // only way to know what it's meant to be is to keep going.
+                [ $._modifierless_class_declaration, $.property_modifier ],
+                [ $._fn_call_lambda_arguments ],
+
+                // `borrowing` and `consuming` are legal as identifiers, but are
+                // also legal modifiers
+                [ $.parameter_modifiers ],
+
+                // These are keywords sometimes, but simple identifiers other
+                // times, and it just depends on the rest of their usage.
+                [
+                  $._contextual_simple_identifier,
+                  $._modifierless_class_declaration
+                ],
+                [
+                  $._contextual_simple_identifier, $.property_behavior_modifier
+                ],
+                [ $._contextual_simple_identifier, $.parameter_modifier ],
+                [ $._contextual_simple_identifier, $.type_parameter_pack ],
+                [ $._contextual_simple_identifier, $.type_pack_expansion ],
+                [ $._contextual_simple_identifier, $.visibility_modifier ],
+],
+  extras : ($) => [$.comment,
+                   $.multiline_comment,
+                   /\s+/, // Whitespace
+],
+  externals : ($) => [
+                // Comments and raw strings are parsed in a custom scanner
+                // because they require us to carry forward state to maintain
+                // symmetry. For instance, parsing a multiline comment requires
+                // us to increment a counter whenever we see
+                // `/*`, and decrement it whenever we see `*/`. A standard
+                // grammar would only be able to exit the comment at the first
+                // `*/` (like C does). Similarly, when you start a string with
+                // `##"`, you're required to include the same number of `#`
+                // symbols to end it.
+                $.multiline_comment,
+                $.raw_str_part,
+                $.raw_str_continuing_indicator,
+                $.raw_str_end_part,
+                // Because Swift doesn't have explicit semicolons, we also do
+                // some whitespace handling in a custom scanner. Line breaks are
+                // _sometimes_ meaningful as the end of a statement: try to
+                // write `let foo: Foo let bar: Bar`, for instance and the
+                // compiler will complain, but add either a newline or a
+                // semicolon and it's fine. We borrow the idea from the Kotlin
+                // grammar that a newline is sometimes a "semicolon". By
+                // including `\n` in both `_semi` and an anonymous `whitespace`
+                // extras, we _should_ be able to let the parser decide if a
+                // newline is meaningful. If the parser sees something like
+                // `foo.bar(1\n)`, it knows that a "semicolon" would not be
+                // valid there, so it parses that as whitespace. On the other
+                // hand, `let foo: Foo\n let bar: Bar` has a meaningful newline.
+                // Unfortunately, we can't simply stop at that. There are some
+                // expressions and statements that remain valid if you end them
+                // early, but are expected to be parsed across multiple lines.
+                // One particular nefarious example is a function declaration,
+                // where you might have something like `func foo<A>(args: A) ->
+                // Foo throws where A: Hashable`. This would still be a valid
+                // declaration even if it ended after the `)`, the `Foo`, or the
+                // `throws`, so a grammar that simply interprets a newline as
+                // "sometimes a semi" would parse those incorrectly. To solve
+                // that case, our custom scanner must do a bit of extra
+                // lookahead itself. If we're about to generate a
+                // `_semi`, we advance a bit further to see if the next
+                // non-whitespace token would be one of these other operators.
+                // If so, we ignore the `_semi` and just produce the operator;
+                // if not, we produce the `_semi` and let the rest of the
+                // grammar sort it out. This isn't perfect, but it works well
+                // enough most of the time.
+                $._implicit_semi,
+                $._explicit_semi,
+                // Every one of the below operators will suppress a `_semi` if
+                // we encounter it after a newline.
+                $._arrow_operator_custom,
+                $._dot_custom,
+                $._conjunction_operator_custom,
+                $._disjunction_operator_custom,
+                $._nil_coalescing_operator_custom,
+                $._eq_custom,
+                $._eq_eq_custom,
+                $._plus_then_ws,
+                $._minus_then_ws,
+                $._bang_custom,
+                $._throws_keyword,
+                $._rethrows_keyword,
+                $.default_keyword,
+                $.where_keyword,
+                $["else"],
+                $.catch_keyword,
+                $._as_custom,
+                $._as_quest_custom,
+                $._as_bang_custom,
+                $._async_keyword_custom,
+                $._custom_operator,
+                $._hash_symbol_custom,
+                $._directive_if,
+                $._directive_elseif,
+                $._directive_else,
+                $._directive_endif,
+
+                // Fake operator that will never get triggered, but follows the
+                // sequence of characters for `try!`. Tracked by the custom
+                // scanner so that it can avoid triggering `$.bang` for that
+                // case.
+                $._fake_try_bang,
+],
+  inline : ($) => [$._locally_permitted_modifiers],
+  rules : {
+    ////////////////////////////////
+    // File Structure
+    ////////////////////////////////
+    source_file : ($) => seq(
+                    optional($.shebang_line),
+                    optional(seq($._top_level_statement,
+                                 repeat(seq($._semi, $._top_level_statement)),
+                                 optional($._semi)))),
+    _semi : ($) => choice($._implicit_semi, $._explicit_semi),
+    shebang_line : ($) => seq($._hash_symbol, "!", /[^\r\n]*/),
+    ////////////////////////////////
+    // Lexical Structure -
+    // https://docs.swift.org/swift-book/ReferenceManual/LexicalStructure.html
+    ////////////////////////////////
+    comment : ($) => token(prec(PRECS.comment, seq("//", /.*/))),
+    // Identifiers
+    simple_identifier : ($) =>
+                          choice(LEXICAL_IDENTIFIER, /`[^\r\n` ]*`/, 
/\$[0-9]+/,
+                                 token(seq("$", LEXICAL_IDENTIFIER)),
+                                 $._contextual_simple_identifier),
+    // Keywords that were added after they were already legal as identifiers.
+    // `tree-sitter` will prefer exact matches
+    // when parsing so unless we explicitly say that these are legal, the parser
+    // will interpret them as their keyword.
+    _contextual_simple_identifier : ($) =>
+                                      choice("actor", "async", "each", "lazy",
+                                             "repeat", "package",
+                                             $._parameter_ownership_modifier),
+    identifier : ($) => sep1($.simple_identifier, $._dot),
+    // Literals
+    _basic_literal : ($) =>
+                       choice($.integer_literal, $.hex_literal, $.oct_literal,
+                              $.bin_literal, $.real_literal, $.boolean_literal,
+                              $._string_literal, $.regex_literal, "nil"),
+    real_literal : ($) => token(choice(seq(DEC_DIGITS, REAL_EXPONENT),
+                                       seq(optional(DEC_DIGITS), ".",
+                                           DEC_DIGITS, 
optional(REAL_EXPONENT)),
+                                       seq("0x", HEX_DIGITS,
+                                           optional(seq(".", HEX_DIGITS)),
+                                           HEX_REAL_EXPONENT))),
+    integer_literal : ($) => token(seq(optional(/[1-9]/), DEC_DIGITS)),
+    hex_literal : ($) => token(seq("0", /[xX]/, HEX_DIGITS)),
+    oct_literal : ($) => token(seq("0", /[oO]/, OCT_DIGITS)),
+    bin_literal : ($) => token(seq("0", /[bB]/, BIN_DIGITS)),
+    boolean_literal : ($) => choice("true", "false"),
+    // String literals
+    _string_literal : ($) => choice($.line_string_literal,
+                                    $.multi_line_string_literal,
+                                    $.raw_string_literal),
+    line_string_literal : ($) => seq(
+                            '"',
+                            repeat(choice(field("text", 
$._line_string_content),
+                                          $._interpolation)),
+                            '"'),
+    _line_string_content : ($) => choice($.line_str_text, $.str_escaped_char),
+    line_str_text : ($) => /[^\\"]+/,
+    str_escaped_char : ($) => choice($._escaped_identifier,
+                                     $._uni_character_literal),
+    _uni_character_literal : ($) => seq("\\", "u", /\{[0-9a-fA-F]+\}/),
+    multi_line_string_literal : ($) =>
+                                  seq('"""',
+                                      repeat(choice(
+                                          field("text",
+                                                $._multi_line_string_content),
+                                          $._interpolation)),
+                                      '"""'),
+    raw_string_literal : ($) => seq(
+                           repeat(seq(
+                               field("text", $.raw_str_part),
+                               field("interpolation", $.raw_str_interpolation),
+                               optional($.raw_str_continuing_indicator))),
+                           field("text", $.raw_str_end_part)),
+    raw_str_interpolation : ($) => seq($.raw_str_interpolation_start,
+                                       $._interpolation_contents, ")"),
+    raw_str_interpolation_start : ($) => /\\#*\(/,
+    _multi_line_string_content : ($) => choice($.multi_line_str_text,
+                                               $.str_escaped_char, '"'),
+    _interpolation : ($) => seq("\\(", $._interpolation_contents, ")"),
+    _interpolation_contents : ($) =>
+                                sep1Opt(field("interpolation",
+                                              alias($.value_argument,
+                                                    
$.interpolated_expression)),
+                                        ","),
+    _escaped_identifier : ($) => /\\[0\\tnr"'\n]/,
+    multi_line_str_text : ($) => /[^\\"]+/,
+    // Based on
+    // https://gitlab.com/woolsweater/tree-sitter-swifter/-/blob/3d47c85bd47ce54cdf2023a9c0e01eb90adfcc1d/grammar.js#L1019
----------------
JDevlieghere wrote:


I think this is likely already the best place to do the attribution.

https://github.com/llvm/llvm-project/pull/181297
_______________________________________________
lldb-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits

[Lldb-commits] [lldb] [lldb] Add tree-sitter based Swift syntax highlighting (PR #181297)

Reply via email to