[PATCH] D159345: [Clang] Handle non-ASCII after line splicing

Corentin Jabot via Phabricator via cfe-commits Fri, 01 Sep 2023 23:12:58 -0700

cor3ntin updated this revision to Diff 555563.
cor3ntin marked an inline comment as done.
cor3ntin added a comment.


Add and fix comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D159345/new/

https://reviews.llvm.org/D159345

Files:
  clang/docs/ReleaseNotes.rst
  clang/include/clang/Lex/Lexer.h
  clang/lib/Lex/Lexer.cpp
  clang/test/Lexer/escape_newline_unicode.c

Index: clang/test/Lexer/escape_newline_unicode.c
===================================================================
--- /dev/null
+++ clang/test/Lexer/escape_newline_unicode.c
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -verify=expected,c -x c -Wunused %s
+// RUN: %clang_cc1 -verify=expected,cpp -x c++ -Wunused %s
+
+void gh65156(void) {
+
+int a\
+ã¹ = 42;
+// expected-warning@-2 {{unused variable 'aã¹'}}
+
+int b\ 
+\ 
+ã¹ = 42;
+// expected-warning@-2 {{backslash and newline separated by space}}
+// expected-warning@-4 {{backslash and newline separated by space}}
+// expected-warning@-5 {{unused variable 'bã¹'}}
+
+int ã¹\
+ã¹ = 42;
+// expected-warning@-2 {{unused variable 'ã¹ã¹'}}
+
+int \
+ã¹ = 42;
+// expected-warning@-2 {{unused variable 'ã¹'}}
+
+}
+
+void gh65156_err(void) {
+
+int \
+â = 0;
+// cpp-error@-2 {{expected unqualified-id}}
+// c-error@-3 {{expected identifier}}
+
+
+int a\
+â = 0;
+// expected-error@-1 {{character <U+274C> not allowed in an identifier}}
+}
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -1750,9 +1750,17 @@
   return true;
 }
 
-bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
-  const char *UnicodePtr = CurPtr;
+bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Tok) {
   llvm::UTF32 CodePoint;
+
+  // If a UTF-8 codepoint appears immediately after an escaped new line,
+  // CurPtr may point to the splicing \ on the preceding line,
+  // so we need to skip it.
+  unsigned FirstCodeUnitSize;
+  getCharAndSize(CurPtr, FirstCodeUnitSize);
+  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
+  const char *UnicodePtr = CharStart;
+
   llvm::ConversionResult Result =
       llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                 (const llvm::UTF8 *)BufferEnd,
@@ -1771,21 +1779,26 @@
         !PP->isPreprocessedOutput())
       diagnoseInvalidUnicodeCodepointInIdentifier(
           PP->getDiagnostics(), LangOpts, CodePoint,
-          makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
+          makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
     // We got a unicode codepoint that is neither a space nor a
     // a valid identifier part. Carry on as if the codepoint was
     // valid for recovery purposes.
   } else if (!isLexingRawMode()) {
     if (IsExtension)
-      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
-                                    makeCharRange(*this, CurPtr, UnicodePtr));
+      diagnoseExtensionInIdentifier(
+          PP->getDiagnostics(), CodePoint,
+          makeCharRange(*this, CharStart, UnicodePtr));
     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
-                              makeCharRange(*this, CurPtr, UnicodePtr),
+                              makeCharRange(*this, CharStart, UnicodePtr),
                               /*IsFirst=*/false);
     maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
-                               makeCharRange(*this, CurPtr, UnicodePtr));
+                               makeCharRange(*this, CharStart, UnicodePtr));
   }
 
+  // Once we sucessfully parsed some UTF-8,
+  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
+  // being lexed, and that warnings about trailing spaces are emmitted.
+  ConsumeChar(CurPtr, FirstCodeUnitSize, Tok);
   CurPtr = UnicodePtr;
   return true;
 }
@@ -1865,7 +1878,7 @@
     }
     if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
       continue;
-    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
       continue;
     // Neither an expected Unicode codepoint nor a UCN.
     break;
@@ -1985,7 +1998,7 @@
   // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
   if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
     return LexNumericConstant(Result, CurPtr);
-  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
     return LexNumericConstant(Result, CurPtr);
 
   // Update the location of token as well as BufferPtr.
@@ -2009,7 +2022,7 @@
   if (!isAsciiIdentifierStart(C)) {
     if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
       Consumed = true;
-    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
       Consumed = true;
     else
       return CurPtr;
@@ -2079,7 +2092,7 @@
     if (isAsciiIdentifierContinue(C)) {
       CurPtr = ConsumeChar(CurPtr, Size, Result);
     } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
-    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
+    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
     } else
       break;
   }
Index: clang/include/clang/Lex/Lexer.h
===================================================================
--- clang/include/clang/Lex/Lexer.h
+++ clang/include/clang/Lex/Lexer.h
@@ -807,7 +807,7 @@
   ///        sequence. On success, updated to point past the end of it.
   /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
   ///         character was lexed, \c false otherwise.
-  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
+  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Tok);
 };
 
 } // namespace clang
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -205,6 +205,8 @@
 - Clang's ``-Wunused-private-field`` no longer warns on fields whose type is
   declared with ``[[maybe_unused]]``.
   (`#61334 <https://github.com/llvm/llvm-project/issues/61334>`_)
+- Correctly parse non-ascii identifiers that appear immediately after a line splicing
+  (`#65156 <https://github.com/llvm/llvm-project/issues/65156>`_`)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D159345: [Clang] Handle non-ASCII after line splicing

Reply via email to