On Thu, Dec 14, 2017 at 8:15 AM, Richard Smith via cfe-commits <cfe-commits@lists.llvm.org> wrote: > Author: rsmith > Date: Thu Dec 14 05:15:08 2017 > New Revision: 320697 > > URL: http://llvm.org/viewvc/llvm-project?rev=320697&view=rev > Log: > Warn if we find a Unicode homoglyph for a symbol in an identifier. > > Specifically, warn if: > * we find a character that the language standard says we must treat as an > identifier, and > * that character is not reasonably an identifier character (it's a > punctuation > character or similar), and > * it renders identically to a valid non-identifier character in common > fixed-width fonts. > > Some tools "helpfully" substitute the surprising characters for the expected > characters, and replacing semicolons with Greek question marks is a common > "prank". > > Modified: > cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td > cfe/trunk/lib/Lex/Lexer.cpp > cfe/trunk/test/Lexer/unicode.c > > Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=320697&r1=320696&r2=320697&view=diff > ============================================================================== > --- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original) > +++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Thu Dec 14 05:15:08 > 2017 > @@ -119,6 +119,9 @@ def err_non_ascii : Error< > def ext_unicode_whitespace : ExtWarn< > "treating Unicode character as whitespace">, > InGroup<DiagGroup<"unicode-whitespace">>; > +def warn_utf8_symbol_homoglyph : Warning< > + "treating Unicode character <U+%0> as identifier character rather than " > + "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
Can this wording be tweaked slightly to "as an identifier character" or does that cause too much of an "a/an" problem with "as %1 symbol"? ~Aaron > > def err_hex_escape_no_digits : Error< > "\\%0 used with no following hex digits">; > > Modified: cfe/trunk/lib/Lex/Lexer.cpp > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=320697&r1=320696&r2=320697&view=diff > ============================================================================== > --- cfe/trunk/lib/Lex/Lexer.cpp (original) > +++ cfe/trunk/lib/Lex/Lexer.cpp Thu Dec 14 05:15:08 2017 > @@ -37,6 +37,7 @@ > #include "llvm/Support/ConvertUTF.h" > #include "llvm/Support/MathExtras.h" > #include "llvm/Support/MemoryBuffer.h" > +#include "llvm/Support/NativeFormatting.h" > #include "llvm/Support/UnicodeCharRanges.h" > #include <algorithm> > #include <cassert> > @@ -1500,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(Di > } > } > > +/// After encountering UTF-8 character C and interpreting it as an identifier > +/// character, check whether it's a homoglyph for a common non-identifier > +/// source character that is unlikely to be an intentional identifier > +/// character and warn if so. > +static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, > + CharSourceRange Range) { > + // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
> + struct HomoglyphPair { > + uint32_t Character; > + char LooksLike; > + bool operator<(HomoglyphPair R) const { return Character < R.Character; } > + }; > + static constexpr HomoglyphPair SortedHomoglyphs[] = { > + {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK > + {U'\u037e', ';'}, // GREEK QUESTION MARK > + {U'\u2212', '-'}, // MINUS SIGN > + {U'\u2215', '/'}, // DIVISION SLASH > + {U'\u2216', '\\'}, // SET MINUS > + {U'\u2217', '*'}, // ASTERISK OPERATOR > + {U'\u2223', '|'}, // DIVIDES > + {U'\u2227', '^'}, // LOGICAL AND > + {U'\u2236', ':'}, // RATIO > + {U'\u223c', '~'}, // TILDE OPERATOR > + {U'\ua789', ':'}, // MODIFIER LETTER COLON > + {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK > + {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN > + {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN > + {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN > + {U'\uff06', '&'}, // FULLWIDTH AMPERSAND > + {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS > + {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS > + {U'\uff0a', '*'}, // FULLWIDTH ASTERISK > + {U'\uff0b', '+'}, // FULLWIDTH ASTERISK > + {U'\uff0c', ','}, // FULLWIDTH COMMA > + {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS > + {U'\uff0e', '.'}, // FULLWIDTH FULL STOP > + {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS > + {U'\uff1a', ':'}, // FULLWIDTH COLON > + {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON > + {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN > + {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN > + {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN > + {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK > + {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT > + {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET > + {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS > + {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET > + {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT > + {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET > + {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE > + {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET > + {U'\uff5e', '~'}, // 
FULLWIDTH TILDE > + {0, 0} > + }; > + auto Homoglyph = > + std::lower_bound(std::begin(SortedHomoglyphs), > + std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, > '\0'}); > + if (Homoglyph->Character == C) { > + llvm::SmallString<5> CharBuf; > + { > + llvm::raw_svector_ostream CharOS(CharBuf); > + llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); > + } > + const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; > + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) > + << Range << CharBuf << LooksLikeStr; > + } > +} > + > bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, > Token &Result) { > const char *UCNPtr = CurPtr + Size; > @@ -1534,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char > !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) > return false; > > - if (!isLexingRawMode()) > + if (!isLexingRawMode()) { > maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, > makeCharRange(*this, CurPtr, UnicodePtr), > /*IsFirst=*/false); > + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, > + makeCharRange(*this, CurPtr, UnicodePtr)); > + } > > CurPtr = UnicodePtr; > return true; > @@ -3737,6 +3810,7 @@ LexNextToken: > // We can't just reset CurPtr to BufferPtr because BufferPtr may point to > // an escaped newline. > --CurPtr; > + const char *UTF8StartPtr = CurPtr; > llvm::ConversionResult Status = > llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, > (const llvm::UTF8 *)BufferEnd, > @@ -3751,6 +3825,9 @@ LexNextToken: > // (We manually eliminate the tail call to avoid recursion.) 
> goto LexNextToken; > } > + if (!isLexingRawMode()) > + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, > + makeCharRange(*this, UTF8StartPtr, > CurPtr)); > return LexUnicode(Result, CodePoint, CurPtr); > } > > > Modified: cfe/trunk/test/Lexer/unicode.c > URL: > http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/unicode.c?rev=320697&r1=320696&r2=320697&view=diff > ============================================================================== > --- cfe/trunk/test/Lexer/unicode.c (original) > +++ cfe/trunk/test/Lexer/unicode.c Thu Dec 14 05:15:08 2017 > @@ -33,3 +33,8 @@ int main () { > int 🌷 = 🌵(🌹); > return 🌷; > } > + > +int n; = 3; // expected-warning {{treating Unicode character <U+037E> as > identifier character rather than as ';' symbol}} > +int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character > <U+A789> as identifier character rather than as ':' symbol}} > + // expected-warning@-1 {{treating Unicode character > <U+037E> as identifier character rather than as ';' symbol}} > +int v＝［＝］（auto）｛return～x；｝（）; // expected-warning > 12{{treating Unicode character}} > > > _______________________________________________ > cfe-commits mailing list > cfe-commits@lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits