Hi rsmith,

This is a missing piece for C99 conformance.

This patch handles UCNs by adding a '\\' case to LexTokenInternal and 
LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN. If 
the UCN is not syntactically well-formed, we fall back to the old treatment: a 
backslash followed by an identifier beginning with 'u' (or 'U').

Because the spelling of an identifier with UCNs still has the UCN in it, we 
need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo.

Of course, valid code that does //not// use UCNs will see only a very minimal 
performance hit (checks after each identifier for non-ASCII characters, checks 
when converting raw_identifiers to identifiers that they do not contain UCNs, 
and checks when getting the spelling of an identifier that it does not contain 
a UCN).

This patch also adds basic support for actual UTF-8 in the source, including 
treating Unicode whitespace as whitespace.

http://llvm-reviews.chandlerc.com/D312

Files:
  include/clang/Basic/ConvertUTF.h
  include/clang/Basic/DiagnosticLexKinds.td
  include/clang/Lex/Lexer.h
  include/clang/Lex/Token.h
  lib/Lex/Lexer.cpp
  lib/Lex/LiteralSupport.cpp
  lib/Lex/Preprocessor.cpp
  test/CXX/over/over.oper/over.literal/p8.cpp
  test/FixIt/fixit-unicode.c
  test/Preprocessor/ucn-pp-identifier.c
  test/Sema/ucn-cstring.c
Index: include/clang/Basic/ConvertUTF.h
===================================================================
--- include/clang/Basic/ConvertUTF.h
+++ include/clang/Basic/ConvertUTF.h
@@ -161,6 +161,16 @@
 
 unsigned getNumBytesForUTF8(UTF8 firstByte);
 
+static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
+                                                   const UTF8 *sourceEnd,
+                                                   UTF32 *target,
+                                                   ConversionFlags flags) {
+  unsigned size = getNumBytesForUTF8(**source);
+  if (size > sourceEnd - *source)
+    return sourceExhausted;
+  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
+}
+
 #ifdef __cplusplus
 }
 
Index: include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- include/clang/Basic/DiagnosticLexKinds.td
+++ include/clang/Basic/DiagnosticLexKinds.td
@@ -93,16 +93,30 @@
   "multi-character character constant">, InGroup<MultiChar>;
 def ext_four_char_character_literal : Extension<
   "multi-character character constant">, InGroup<FourByteMultiChar>;
-  
 
-// Literal
-def ext_nonstandard_escape : Extension<
-  "use of non-standard escape character '\\%0'">;
-def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
-def err_hex_escape_no_digits : Error<"\\x used with no following hex digits">;
-def err_ucn_escape_no_digits : Error<"\\u used with no following hex digits">;
+
+// Unicode and UCNs
+def err_invalid_utf8 : Error<
+  "source file is not valid UTF-8">;
+def err_non_ascii : Error<
+  "non-ASCII characters are not allowed outside of literals and identifiers">;
+def ext_unicode_whitespace : ExtWarn<
+  "treating Unicode character as whitespace">,
+  InGroup<DiagGroup<"unicode-whitespace">>;
+
+def err_hex_escape_no_digits : Error<
+  "\\%0 used with no following hex digits">;
+def warn_ucn_escape_no_digits : Warning<
+  "\\%0 used with no following hex digits; "
+  "treating as '\\' followed by identifier">, InGroup<Unicode>;
+def err_ucn_escape_incomplete : Error<
+  "incomplete universal character name">;
+def warn_ucn_escape_incomplete : Warning<
+  "incomplete universal character name; "
+  "treating as '\\' followed by identifier">, InGroup<Unicode>;
+def note_ucn_four_not_eight : Note<"did you mean to use '\\u'?">;
 def err_ucn_escape_invalid : Error<"invalid universal character">;
-def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
+
 def err_ucn_escape_basic_scs : Error<
   "character '%0' cannot be specified by a universal character name">;
 def err_ucn_control_character : Error<
@@ -113,6 +127,12 @@
 def warn_cxx98_compat_literal_ucn_control_character : Warning<
   "universal character name referring to a control character "
   "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
+
+
+// Literal
+def ext_nonstandard_escape : Extension<
+  "use of non-standard escape character '\\%0'">;
+def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
 def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
 def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
 def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
Index: include/clang/Lex/Lexer.h
===================================================================
--- include/clang/Lex/Lexer.h
+++ include/clang/Lex/Lexer.h
@@ -437,6 +437,11 @@
   ///
   void LexTokenInternal(Token &Result);
 
+  /// Given that a token begins with the Unicode character \p C, figure out
+  /// what kind of token it is and dispatch to the appropriate lexing helper
+  /// function.
+  void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
+
   /// FormTokenWithChars - When we lex a token, we have identified a span
   /// starting at BufferPtr, going to TokEnd that forms the token.  This method
   /// takes that range and assigns it to the token as its location and size.  In
@@ -579,6 +584,21 @@
   void cutOffLexing() { BufferPtr = BufferEnd; }
 
   bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
+
+
+  /// Read a universal character name.
+  ///
+  /// \param CurPtr The position in the source buffer after the initial '\'.
+  ///               If the UCN is syntactically well-formed (but not necessarily
+  ///               valid), this parameter will be updated to point to the
+  ///               character after the UCN.
+  /// \param SlashLoc The position in the source buffer of the '\'.
+  /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
+  ///            and handle token formation in the caller.
+  ///
+  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
+  ///         invalid.
+  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
 };
 
 
Index: include/clang/Lex/Token.h
===================================================================
--- include/clang/Lex/Token.h
+++ include/clang/Lex/Token.h
@@ -74,9 +74,10 @@
     StartOfLine   = 0x01,  // At start of line or only after whitespace.
     LeadingSpace  = 0x02,  // Whitespace exists before this token.
     DisableExpand = 0x04,  // This identifier may never be macro expanded.
-    NeedsCleaning = 0x08,   // Contained an escaped newline or trigraph.
+    NeedsCleaning = 0x08,  // Contained an escaped newline or trigraph.
     LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
-    HasUDSuffix = 0x20     // This string or character literal has a ud-suffix.
+    HasUDSuffix = 0x20,    // This string or character literal has a ud-suffix.
+    HasUCN = 0x40          // This identifier contains a UCN.
   };
 
   tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
@@ -257,6 +258,9 @@
   /// \brief Return true if this token is a string or character literal which
   /// has a ud-suffix.
   bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
+
+  /// Returns true if this token contains a universal character name.
+  bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
 };
 
 /// \brief Information about the conditional stack (\#if directives)
Index: lib/Lex/Lexer.cpp
===================================================================
--- lib/Lex/Lexer.cpp
+++ lib/Lex/Lexer.cpp
@@ -25,11 +25,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Lex/Lexer.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/CodeCompletionHandler.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -371,10 +373,12 @@
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
   if (Tok.is(tok::raw_identifier))
     TokStart = Tok.getRawIdentifierData();
-  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
-    // Just return the string from the identifier table, which is very quick.
-    Buffer = II->getNameStart();
-    return II->getLength();
+  else if (!Tok.hasUCN()) {
+    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
+      // Just return the string from the identifier table, which is very quick.
+      Buffer = II->getNameStart();
+      return II->getLength();
+    }
   }
 
   // NOTE: this can be checked even after testing for an IdentifierInfo.
@@ -1376,7 +1380,6 @@
 ///   2. If this is an escaped newline (potentially with whitespace between
 ///      the backslash and newline), implicitly skip the newline and return
 ///      the char after it.
-///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
 ///
 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
 /// know that we can accumulate into Size, and that we have already incremented
@@ -1509,6 +1512,73 @@
   IsAtStartOfLine = StartOfLine;
 }
 
+namespace {
+  struct UCNCharRange {
+    uint32_t Lower;
+    uint32_t Upper;
+  };
+  
+  // C11 D.1, C++11 [charname.allowed]
+  // FIXME: C99 and C++03 each have a different set of allowed UCNs.
+  const UCNCharRange UCNAllowedCharRanges[] = {
+    // 1
+    { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD },
+    { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
+    { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 },
+    { 0x00F8, 0x00FF },
+    // 2
+    { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF },
+    // 3
+    { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 },
+    { 0x2054, 0x2054 }, { 0x2060, 0x206F },
+    // 4
+    { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 },
+    { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF },
+    // 5
+    { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F },
+    // 6
+    { 0x3040, 0xD7FF },
+    // 7
+    { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 },
+    { 0xFE47, 0xFFFD },
+    // 8
+    { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },
+    { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
+    { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
+    { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
+    { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD }
+  };
+}
+
+static bool isAllowedIDChar(uint32_t c) {
+  unsigned LowPoint = 0;
+  unsigned HighPoint = llvm::array_lengthof(UCNAllowedCharRanges);
+
+  // Binary search the UCNAllowedCharRanges set.
+  while (HighPoint != LowPoint) {
+    unsigned MidPoint = (HighPoint + LowPoint) / 2;
+    if (c < UCNAllowedCharRanges[MidPoint].Lower)
+      HighPoint = MidPoint;
+    else if (c > UCNAllowedCharRanges[MidPoint].Upper)
+      LowPoint = MidPoint + 1;
+    else
+      return true;
+  }
+
+  return false;
+}
+
+static bool isAllowedInitiallyIDChar(uint32_t c) {
+  // C11 D.2, C++11 [charname.disallowed]
+  // FIXME: C99 only forbids "digits", presumably as described in C99 Annex D.
+  // FIXME: C++03 does not forbid any initial characters.
+  return !(0x0300 <= c && c <= 0x036F) &&
+         !(0x1DC0 <= c && c <= 0x1DFF) &&
+         !(0x20D0 <= c && c <= 0x20FF) &&
+         !(0xFE20 <= c && c <= 0xFE2F);
+}
+
+
 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
   unsigned Size;
@@ -1520,11 +1590,11 @@
 
   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
-  // FIXME: UCNs.
   //
   // TODO: Could merge these checks into a CharInfo flag to make the comparison
   // cheaper
-  if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) {
+  if (isascii(C) && C != '\\' && C != '?' &&
+      (C != '$' || !LangOpts.DollarIdents)) {
 FinishIdentifier:
     const char *IdStart = BufferPtr;
     FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
@@ -1561,8 +1631,34 @@
       CurPtr = ConsumeChar(CurPtr, Size, Result);
       C = getCharAndSize(CurPtr, Size);
       continue;
-    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
-      // Found end of identifier.
+
+    } else if (C == '\\') {
+      const char *UCNPtr = CurPtr + Size;
+      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
+      if (CodePoint == 0 || !isAllowedIDChar(CodePoint))
+        goto FinishIdentifier;
+
+      Result.setFlag(Token::HasUCN);
+      while (CurPtr != UCNPtr)
+        (void)getAndAdvanceChar(CurPtr, Result);
+
+      C = getCharAndSize(CurPtr, Size);
+      continue;
+    } else if (!isascii(C)) {
+      const char *UnicodePtr = CurPtr;
+      UTF32 CodePoint;
+      ConversionResult Result = convertUTF8Sequence((const UTF8 **)&UnicodePtr,
+                                                    (const UTF8 *)BufferEnd,
+                                                    &CodePoint,
+                                                    strictConversion);
+      if (Result != conversionOK ||
+          !isAllowedIDChar(static_cast<uint32_t>(CodePoint)))
+        goto FinishIdentifier;
+
+      CurPtr = UnicodePtr;
+      C = getCharAndSize(CurPtr, Size);
+      continue;
+    } else if (!isIdentifierBody(C)) {
       goto FinishIdentifier;
     }
 
@@ -1570,7 +1666,7 @@
     CurPtr = ConsumeChar(CurPtr, Size, Result);
 
     C = getCharAndSize(CurPtr, Size);
-    while (isIdentifierBody(C)) { // FIXME: UCNs.
+    while (isIdentifierBody(C)) {
       CurPtr = ConsumeChar(CurPtr, Size, Result);
       C = getCharAndSize(CurPtr, Size);
     }
@@ -1595,7 +1691,7 @@
   unsigned Size;
   char C = getCharAndSize(CurPtr, Size);
   char PrevCh = 0;
-  while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+  while (isNumberBody(C)) {
     CurPtr = ConsumeChar(CurPtr, Size, Result);
     PrevCh = C;
     C = getCharAndSize(CurPtr, Size);
@@ -2592,6 +2688,151 @@
   return false;
 }
 
+uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
+                           Token *Result) {
+  assert(LangOpts.CPlusPlus || LangOpts.C99);
+
+  unsigned CharSize;
+  char Kind = getCharAndSize(StartPtr, CharSize);
+
+  unsigned NumHexDigits;
+  if (Kind == 'u')
+    NumHexDigits = 4;
+  else if (Kind == 'U')
+    NumHexDigits = 8;
+  else
+    return 0;
+
+  const char *CurPtr = StartPtr + CharSize;
+  const char *KindLoc = &CurPtr[-1];
+
+  uint32_t CodePoint = 0;
+  for (unsigned i = 0; i < NumHexDigits; ++i) {
+    char C = getCharAndSize(CurPtr, CharSize);
+
+    unsigned Value = llvm::hexDigitValue(C);
+    if (Value == -1U) {
+      if (Result && !isLexingRawMode()) {
+        if (i == 0) {
+          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
+            << StringRef(KindLoc, 1);
+        } else {
+          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
+
+          // If the user wrote \U1234, suggest a fixit to \u.
+          if (i == 4 && NumHexDigits == 8) {
+            CharSourceRange URange =
+              CharSourceRange::getCharRange(getSourceLocation(KindLoc),
+                                            getSourceLocation(KindLoc + 1));
+            Diag(KindLoc, diag::note_ucn_four_not_eight)
+              << FixItHint::CreateReplacement(URange, "u");
+          }
+        }
+      }
+      
+      return 0;
+    }
+
+    CodePoint <<= 4;
+    CodePoint += Value;
+
+    CurPtr += CharSize;
+  }
+
+  if (Result) {
+    Result->setFlag(Token::HasUCN);
+    while (StartPtr != CurPtr)
+      getAndAdvanceChar(StartPtr, *Result);
+  } else {
+    StartPtr = CurPtr;
+  }
+
+  // C99 6.4.3p2: A universal character name shall not specify a character whose
+  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
+  //   0060 (‘), nor one in the range D800 through DFFF inclusive.)
+  if (CodePoint < 0xA0) {
+    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
+      return CodePoint;
+
+    if (Result && !isLexingRawMode()) {
+      if (CodePoint < 0x20 || CodePoint >= 0x7F)
+        Diag(BufferPtr, diag::err_ucn_control_character);
+      else {
+        char C = static_cast<char>(CodePoint);
+        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
+      }
+    }
+
+    return 0;
+    
+  } else if ((!LangOpts.CPlusPlus || LangOpts.CPlusPlus11) &&
+             (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)) {
+    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
+    if (Result && !isLexingRawMode())
+      Diag(BufferPtr, diag::err_ucn_escape_invalid);
+    return 0;
+  }
+
+  return CodePoint;
+}
+
+static bool isUnicodeWhitespace(uint32_t C) {
+  return (C == 0x0085 || C == 0x00A0 || C == 0x1680 ||
+          C == 0x180E || (C >= 0x2000 && C <= 0x200A) ||
+          C == 0x2028 || C == 0x2029 || C == 0x202F ||
+          C == 0x205F || C == 0x3000);
+}
+
+void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
+  if (isUnicodeWhitespace(C)) {
+    if (!isLexingRawMode()) {
+      CharSourceRange CharRange =
+        CharSourceRange::getCharRange(getSourceLocation(),
+                                      getSourceLocation(CurPtr));
+      Diag(BufferPtr, diag::ext_unicode_whitespace)
+        << CharRange;
+    }
+
+    Result.setFlag(Token::LeadingSpace);
+    if (SkipWhitespace(Result, CurPtr))
+      return; // KeepWhitespaceMode
+
+    return LexTokenInternal(Result);
+  }
+
+  if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) {
+    MIOpt.ReadToken();
+    return LexIdentifier(Result, CurPtr);
+  }
+
+  if (!isascii(*BufferPtr) && !isAllowedIDChar(C)) {
+    // Non-ASCII characters tend to creep into source code unintentionally.
+    // Instead of letting the parser complain about the unknown token,
+    // just drop the character.
+    // Note that we can /only/ do this when the non-ASCII character is actually
+    // spelled as Unicode, not written as a UCN. The standard requires that
+    // we not throw away any possible preprocessor tokens, but there's a
+    // loophole in the mapping of Unicode characters to basic character set
+    // characters that allows us to map these particular characters to, say,
+    // whitespace.
+    if (!isLexingRawMode()) {
+      CharSourceRange CharRange =
+        CharSourceRange::getCharRange(getSourceLocation(),
+                                      getSourceLocation(CurPtr));
+      Diag(BufferPtr, diag::err_non_ascii)
+        << FixItHint::CreateRemoval(CharRange);
+    }
+
+    BufferPtr = CurPtr;
+    return LexTokenInternal(Result);
+  }
+
+  // Otherwise, we have an explicit UCN or a character that's unlikely to show
+  // up by accident.
+  MIOpt.ReadToken();
+  FormTokenWithChars(Result, CurPtr, tok::unknown);
+}
+
 
 /// LexTokenInternal - This implements a simple C family lexer.  It is an
 /// extremely performance critical piece of code.  This assumes that the buffer
@@ -3243,12 +3484,41 @@
       Kind = tok::unknown;
     break;
 
+  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
   case '\\':
-    // FIXME: UCN's.
-    // FALL THROUGH.
-  default:
+    if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
+      return LexUnicode(Result, CodePoint, CurPtr);
+
     Kind = tok::unknown;
     break;
+
+  default: {
+    if (isascii(Char)) {
+      Kind = tok::unknown;
+      break;
+    }
+
+    UTF32 CodePoint;
+
+    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
+    // an escaped newline.
+    --CurPtr;
+    ConversionResult Status = convertUTF8Sequence((const UTF8 **)&CurPtr,
+                                                  (const UTF8 *)BufferEnd,
+                                                  &CodePoint,
+                                                  strictConversion);
+    if (Status == conversionOK)
+      return LexUnicode(Result, CodePoint, CurPtr);
+    
+    // Non-ASCII characters tend to creep into source code unintentionally.
+    // Instead of letting the parser complain about the unknown token,
+    // just warn that we don't have valid UTF-8, then drop the character.
+    if (!isLexingRawMode())
+      Diag(CurPtr, diag::err_invalid_utf8);
+
+    BufferPtr = CurPtr+1;
+    goto LexNextToken;
+  }
   }
 
   // Notify MIOpt that we read a non-whitespace/non-comment token.
Index: lib/Lex/LiteralSupport.cpp
===================================================================
--- lib/Lex/LiteralSupport.cpp
+++ lib/Lex/LiteralSupport.cpp
@@ -130,7 +130,7 @@
     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
       if (Diags)
         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
-             diag::err_hex_escape_no_digits);
+             diag::err_hex_escape_no_digits) << "x";
       HadError = 1;
       break;
     }
@@ -226,7 +226,7 @@
   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
     if (Diags)
       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
-           diag::err_ucn_escape_no_digits);
+           diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
     return false;
   }
   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
Index: lib/Lex/Preprocessor.cpp
===================================================================
--- lib/Lex/Preprocessor.cpp
+++ lib/Lex/Preprocessor.cpp
@@ -27,6 +27,7 @@
 
 #include "clang/Lex/Preprocessor.h"
 #include "MacroArgs.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
@@ -43,6 +44,8 @@
 #include "clang/Lex/ScratchBuffer.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
@@ -396,7 +399,7 @@
                                           SmallVectorImpl<char> &Buffer,
                                           bool *Invalid) const {
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
-  if (Tok.isNot(tok::raw_identifier)) {
+  if (Tok.isNot(tok::raw_identifier) && !Tok.hasUCN()) {
     // Try the fast path.
     if (const IdentifierInfo *II = Tok.getIdentifierInfo())
       return II->getName();
@@ -494,6 +497,48 @@
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//
 
+static void appendCodePoint(unsigned Codepoint,
+                            llvm::SmallVectorImpl<char> &Str) {
+  char ResultBuf[4];
+  char *ResultPtr = ResultBuf;
+  bool Res = ConvertCodePointToUTF8(Codepoint, ResultPtr);
+  (void)Res;
+  assert(Res && "Unexpected conversion failure");
+  Str.append(ResultBuf, ResultPtr);
+}
+
+static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
+  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
+    if (*I != '\\') {
+      Buf.push_back(*I);
+      continue;
+    }
+
+    ++I;
+    assert(*I == 'u' || *I == 'U');
+
+    unsigned NumHexDigits;
+    if (*I == 'u')
+      NumHexDigits = 4;
+    else
+      NumHexDigits = 8;
+
+    assert(I + NumHexDigits <= E);
+
+    uint32_t CodePoint = 0;
+    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
+      unsigned Value = llvm::hexDigitValue(*I);
+      assert(Value != -1U);
+
+      CodePoint <<= 4;
+      CodePoint += Value;
+    }
+
+    appendCodePoint(CodePoint, Buf);
+    --I;
+  }
+}
+
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
@@ -502,15 +547,22 @@
 
   // Look up this token, see if it is a macro, or if it is a language keyword.
   IdentifierInfo *II;
-  if (!Identifier.needsCleaning()) {
+  if (!Identifier.needsCleaning() && !Identifier.hasUCN()) {
     // No cleaning needed, just use the characters from the lexed buffer.
     II = getIdentifierInfo(StringRef(Identifier.getRawIdentifierData(),
-                                           Identifier.getLength()));
+                                     Identifier.getLength()));
   } else {
     // Cleaning needed, alloca a buffer, clean into it, then use the buffer.
     SmallString<64> IdentifierBuffer;
     StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer);
-    II = getIdentifierInfo(CleanedStr);
+
+    if (Identifier.hasUCN()) {
+      SmallString<64> UCNIdentifierBuffer;
+      expandUCNs(UCNIdentifierBuffer, CleanedStr);
+      II = getIdentifierInfo(UCNIdentifierBuffer);
+    } else {
+      II = getIdentifierInfo(CleanedStr);
+    }
   }
 
   // Update the token info (identifier info and appropriate token kind).
Index: test/CXX/over/over.oper/over.literal/p8.cpp
===================================================================
--- test/CXX/over/over.oper/over.literal/p8.cpp
+++ test/CXX/over/over.oper/over.literal/p8.cpp
@@ -7,8 +7,7 @@
 
 void operator "" _km(long double); // ok
 string operator "" _i18n(const char*, std::size_t); // ok
-// FIXME: This should be accepted once we support UCNs
-template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-error {{expected identifier}}
+template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-warning {{reserved}}
 float operator ""E(const char *); // expected-error {{invalid suffix on literal}} expected-warning {{reserved}}
 float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}}
 string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}}
Index: test/FixIt/fixit-unicode.c
===================================================================
--- test/FixIt/fixit-unicode.c
+++ test/FixIt/fixit-unicode.c
@@ -8,13 +8,15 @@
 // PR13312
 void test1() {
   struct Foo foo;
-  (&foo)☃>bar = 42;
+  foo.bar = 42☃
+// CHECK: error: non-ASCII characters are not allowed outside of literals and identifiers
+// CHECK: {{^              \^}}
 // CHECK: error: expected ';' after expression
 // Make sure we emit the fixit right in front of the snowman.
-// CHECK: {{^        \^}}
-// CHECK: {{^        ;}}
+// CHECK: {{^              \^}}
+// CHECK: {{^              ;}}
 
-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{11:9-11:9}:";"
+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-8]]:15-[[@LINE-8]]:15}:";"
 }
 
 
@@ -29,5 +31,5 @@
 // because different systems will render the delta differently (either as a
 // character, or as <U+2206>.) The fixit should line up with the %d regardless.
 
-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{23:16-23:18}:"%ld"
+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-9]]:16-[[@LINE-9]]:18}:"%ld"
 }
Index: test/Preprocessor/ucn-pp-identifier.c
===================================================================
--- /dev/null
+++ test/Preprocessor/ucn-pp-identifier.c
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef
+// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef
+// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -fdiagnostics-parseable-fixits -Wundef 2>&1 | FileCheck -strict-whitespace %s
+
+#define \u00FC
+#define a\u00FD() 0
+#ifndef \u00FC
+#error "This should never happen"
+#endif
+
+#if a\u00FD()
+#error "This should never happen"
+#endif
+
+#if a\U000000FD()
+#error "This should never happen"
+#endif
+
+#if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+#if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+#if \U0001000  // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+
+// Make sure we reject disallowed UCNs
+#define \ufffe // expected-error {{macro names must be identifiers}}
+#define \U10000000  // expected-error {{macro names must be identifiers}}
+#define \u0061  // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro names must be identifiers}}
+
+// FIXME: Not clear what our behavior should be here; \u0024 is "$".
+#define a\u0024  // expected-warning {{whitespace}}
+
+#if \u0110 // expected-warning {{is not defined, evaluates to 0}}
+#endif
+
+
+#define \u0110 1 / 0
+#if \u0110 // expected-error {{division by zero in preprocessor expression}}
+#endif
+
+#define STRINGIZE(X) # X
+
+extern int check_size[sizeof(STRINGIZE(\u0112)) == 3 ? 1 : -1];
+
+
+#ifndef __cplusplus
+
+#define newline_1_\u00F\
+C 1
+#define newline_2_\u00\
+F\
+C 1
+#define newline_3_\u\
+00\
+FC 1
+#define newline_4_\\
+u00FC 1
+#define newline_5_\\
+u\
+\
+0\
+0\
+F\
+C 1
+
+#if (newline_1_\u00FC && newline_2_\u00FC && newline_3_\u00FC && \
+     newline_4_\u00FC && newline_5_\u00FC)
+#else
+#error "Line splicing failed to produce UCNs"
+#endif
+
+
+// expected-warning@+1 {{incomplete universal character name}} expected-note@+1 {{did you mean to use '\u'?}} expected-warning@+1 {{whitespace}}
+#define capital_u_\U00FC
+// CHECK: note: did you mean to use '\u'?
+// CHECK-NEXT:   #define capital_u_\U00FC
+// CHECK-NEXT: {{^                   \^}}
+// CHECK-NEXT: {{^                   u}}
+
+#endif
Index: test/Sema/ucn-cstring.c
===================================================================
--- test/Sema/ucn-cstring.c
+++ test/Sema/ucn-cstring.c
@@ -8,7 +8,7 @@
   printf("%s (%zd)\n", "hello \u2192 \u2603 \u2190 world", sizeof("hello \u2192 \u2603 \u2190 world"));
   printf("%s (%zd)\n", "\U00010400\U0001D12B", sizeof("\U00010400\U0001D12B"));
   // Some error conditions...
-  printf("%s\n", "\U"); // expected-error{{\u used with no following hex digits}}
+  printf("%s\n", "\U"); // expected-error{{\U used with no following hex digits}}
   printf("%s\n", "\U00"); // expected-error{{incomplete universal character name}}
   printf("%s\n", "\U0001"); // expected-error{{incomplete universal character name}}
   printf("%s\n", "\u0001"); // expected-error{{universal character name refers to a control character}}
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

Reply via email to