Index: include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- include/clang/Basic/DiagnosticLexKinds.td	(revision 168200)
+++ include/clang/Basic/DiagnosticLexKinds.td	(working copy)
@@ -93,8 +93,11 @@
   "multi-character character constant">, InGroup<MultiChar>;
 def ext_four_char_character_literal : Extension<
   "multi-character character constant">, InGroup<FourByteMultiChar>;
-  
 
+// UTF-8 in source file
+def err_bad_encoding : Error<"illegal character encoding">;
+def ext_utf8_whitespace : ExtWarn<"UTF-8 whitespace character">;
+
 // Literal
 def ext_nonstandard_escape : Extension<
   "use of non-standard escape character '\\%0'">;
Index: include/clang/Lex/Lexer.h
===================================================================
--- include/clang/Lex/Lexer.h	(revision 168200)
+++ include/clang/Lex/Lexer.h	(working copy)
@@ -473,7 +473,7 @@
   /// can return false for characters that end up being the same, but it will
   /// never return true for something that needs to be mapped.
   static bool isObviouslySimpleCharacter(char C) {
-    return C != '?' && C != '\\';
+    return C != '?' && C != '\\' && !(C & 0x80); // char may be unsigned
   }
 
   /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
@@ -481,8 +481,8 @@
   /// just handle the trivial case and fall-back to the non-inlined
   /// getCharAndSizeSlow method to handle the hard case.
   inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
-    // If this is not a trigraph and not a UCN or escaped newline, return
-    // quickly.
+    // If this is not a trigraph and not a UCN or escaped newline or UTF8,
+    // return quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
 
     unsigned Size = 0;
Index: lib/Lex/Lexer.cpp
===================================================================
--- lib/Lex/Lexer.cpp	(revision 168200)
+++ lib/Lex/Lexer.cpp	(working copy)
@@ -28,6 +28,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Lex/CodeCompletionHandler.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/SourceManager.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/STLExtras.h"
@@ -980,7 +981,8 @@
   CHAR_NUMBER   = 0x08,  // 0-9
   CHAR_UNDER    = 0x10,  // _
   CHAR_PERIOD   = 0x20,  // .
-  CHAR_RAWDEL   = 0x40   // {}[]#<>%:;?*+-/^&|~!=,"'
+  CHAR_RAWDEL   = 0x40,  // {}[]#<>%:;?*+-/^&|~!=,"'
+  CHAR_COMBINING = 0x80  // UTF8 combining character
 };
 
 // Statically initialize CharInfo table based on ASCII character set
@@ -1050,7 +1052,9 @@
 //120  x       121  y        122  z        123  {
 //124  |       125  }        126  ~        127 DEL
    CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
-   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
+   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0           ,
+//128 UTF8_IdentifierStart   129 UTF8_Identifier
+   CHAR_LETTER               , CHAR_COMBINING
 };
 
 static void InitCharacterInfo() {
@@ -1085,7 +1089,8 @@
 /// isIdentifierBody - Return true if this is the body character of an
-/// identifier, which is [a-zA-Z0-9_].
+/// identifier: [a-zA-Z0-9_] or a UTF8_IdentifierStart/UTF8_Identifier kind.
 static inline bool isIdentifierBody(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
+  const unsigned BodyMask = CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_COMBINING;
+  return (CharInfo[c] & BodyMask) != 0;
 }
 
 /// isHorizontalWhitespace - Return true if this character is horizontal
@@ -1111,8 +1116,8 @@
 /// isNumberBody - Return true if this is the body character of an
-/// preprocessing number, which is [a-zA-Z0-9_.].
+/// preprocessing number: [a-zA-Z0-9_.] or a UTF-8 identifier character kind.
 static inline bool isNumberBody(unsigned char c) {
-  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
-    true : false;
+  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|
+                         CHAR_COMBINING)) != 0;
 }
 
 /// isRawStringDelimBody - Return true if this is the body character of a
@@ -1333,6 +1338,107 @@
   return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
 }
 
+// FIXME: Stolen shamelessly from Eli's UCN patch.
+namespace {
+  /// An inclusive range [Lower, Upper] of Unicode code points.
+  struct UCNCharRange {
+    unsigned Lower;
+    unsigned Upper;
+  };
+
+  /// Code points accepted in identifiers, stored as inclusive ranges.
+  ///
+  /// The table is const because it is only ever read. It must stay sorted and
+  /// non-overlapping, since isAllowedIDChar binary-searches it.
+  /// NOTE(review): groups 1-8 look like C11 Annex D.1 paragraphs -- confirm.
+  const UCNCharRange UCNAllowedCharRanges[] = {
+    { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD },  // 1
+    { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
+    { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 },
+    { 0x00F8, 0x00FF },
+    { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF },  // 2
+    { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 },  // 3
+    { 0x2054, 0x2054 }, { 0x2060, 0x206F },
+    { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 },  // 4
+    { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF },
+    { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F },  // 5
+    { 0x3040, 0xD7FF },  // 6
+    { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 },  // 7
+    { 0xFE47, 0xFFFD },
+    { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },  // 8
+    { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
+    { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
+    { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
+    { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD }
+  };
+}
+
+/// Binary-search the sorted UCNAllowedCharRanges table for code point \p c.
+static bool isAllowedIDChar(unsigned c) {
+  unsigned Begin = 0;
+  unsigned End = llvm::array_lengthof(UCNAllowedCharRanges);
+  while (Begin != End) {
+    const unsigned Mid = (Begin + End) / 2;
+    const UCNCharRange &Range = UCNAllowedCharRanges[Mid];
+    if (Range.Lower <= c && c <= Range.Upper)
+      return true;
+    // Otherwise keep only the half of the table that can still contain c.
+    if (c < Range.Lower) End = Mid; else Begin = Mid + 1;
+  }
+  return false;
+}
+
+/// Assumes isAllowedIDChar(c); rejects ranges that cannot start an identifier.
+static bool isAllowedInitiallyIDChar(unsigned c) {
+  const bool InDisallowedRange =
+      (0x0300 <= c && c <= 0x036F) || (0x1DC0 <= c && c <= 0x1DFF) ||
+      (0x20D0 <= c && c <= 0x20FF) || (0xFE20 <= c && c <= 0xFE2F);
+  return !InDisallowedRange;
+}
+
+/// How a decoded UTF-8 sequence is classified by getUTF8CharAndSize.
+enum UTF8CharKind {
+  /// The code point may begin an identifier.
+  UTF8_IdentifierStart = static_cast<char>(0x80),
+  /// The code point may continue, but not begin, an identifier.
+  UTF8_Identifier = static_cast<char>(0x81),
+  /// The code point is treated as whitespace.
+  UTF8_Whitespace = static_cast<char>(0x82),
+  /// A validly encoded code point in none of the categories above.
+  UTF8_Unknown = static_cast<char>(0x83),
+  /// The bytes do not form a valid UTF-8 sequence.
+  UTF8_Invalid = static_cast<char>(0x84)
+};
+
+/// Classify the UTF-8 sequence at \p Ptr and set \p Size to its byte length.
+static UTF8CharKind getUTF8CharAndSize(const char *Ptr, unsigned &Size) {
+  const UTF8 *CurPtr = reinterpret_cast<const UTF8*>(Ptr);
+  assert(*CurPtr & 0x80);
+  UTF32 Value;
+  UTF32 *ValuePtr = &Value;
+  Size = getNumBytesForUTF8(*CurPtr);
+  // Reject truncated sequences up front so no byte past them is ever decoded.
+  unsigned Avail = 1;
+  while (Avail < Size && (CurPtr[Avail] & 0xC0) == 0x80) ++Avail;
+  if (Avail != Size ||
+      ConvertUTF8toUTF32(&CurPtr, CurPtr + Size, &ValuePtr, &Value + 1,
+                         strictConversion) != conversionOK) {
+    // Try to resync to the input stream, by skipping all high bytes.
+    do { ++CurPtr; } while (*CurPtr & 0x80);
+    Size = reinterpret_cast<const char*>(CurPtr) - Ptr;
+    return UTF8_Invalid;
+  }
+  if (Value == 0x0085 || Value == 0x00A0 || Value == 0x1680 ||
+      Value == 0x180E || (Value >= 0x2000 && Value <= 0x200A) ||
+      Value == 0x2028 || Value == 0x2029 || Value == 0x202F ||
+      Value == 0x205F || Value == 0x3000)
+    return UTF8_Whitespace;
+  if (!isAllowedIDChar(Value))
+    return UTF8_Unknown;
+  return isAllowedInitiallyIDChar(Value) ? UTF8_IdentifierStart
+                                         : UTF8_Identifier;
+}
+
 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
 /// get its size, and return it.  This is tricky in several cases:
 ///   1. If currently at the start of a trigraph, we warn about the trigraph,
@@ -1403,6 +1509,18 @@
     }
   }
 
+  // Convert UTF-8 sequences to a classification of the codepoint.
+  if (Ptr[0] < 0) {
+    UTF8CharKind CK = getUTF8CharAndSize(Ptr, Size);
+    // FIXME: We want to allow bad encodings in character and string literals
+    // (for compatibility), but not in the ud-suffix of a literal, and not in
+    // numeric literals.
+    if (CK == UTF8_Invalid && Tok && !isLexingRawMode())
+      Diag(Ptr, diag::err_bad_encoding)
+        << SourceRange(getSourceLocation(Ptr), getSourceLocation(Ptr + Size));
+    return CK;
+  }
+
   // If this is neither, return a single character.
   ++Size;
   return *Ptr;
@@ -1457,6 +1575,10 @@
     }
   }
 
+  // Convert UTF-8 sequences to a classification of the codepoint.
+  if (Ptr[0] < 0)
+    return getUTF8CharAndSize(Ptr, Size);
+
   // If this is neither, return a single character.
   ++Size;
   return *Ptr;
@@ -1489,7 +1611,8 @@
   //
   // TODO: Could merge these checks into a CharInfo flag to make the comparison
   // cheaper
-  if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) {
+  if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents) &&
+      C < 0x80) {
 FinishIdentifier:
     const char *IdStart = BufferPtr;
     FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
@@ -1529,11 +1652,11 @@
     } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
       // Found end of identifier.
       goto FinishIdentifier;
+    } else {
+      // Otherwise, this character is good, consume it.
+      CurPtr = ConsumeChar(CurPtr, Size, Result);
     }
 
-    // Otherwise, this character is good, consume it.
-    CurPtr = ConsumeChar(CurPtr, Size, Result);
-
     C = getCharAndSize(CurPtr, Size);
     while (isIdentifierBody(C)) { // FIXME: UCNs.
       CurPtr = ConsumeChar(CurPtr, Size, Result);
@@ -2664,6 +2787,10 @@
     if (SkipWhitespace(Result, CurPtr))
       return; // KeepWhitespaceMode
     goto LexNextToken;   // GCC isn't tail call eliminating.
+  case UTF8_Whitespace:
+    Diag(BufferPtr, diag::ext_utf8_whitespace)
+      << SourceRange(getSourceLocation(BufferPtr), getSourceLocation(CurPtr));
+    // Fall through
   case ' ':
   case '\t':
   case '\f':
@@ -2829,7 +2956,7 @@
   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
   case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
   case 'v': case 'w': case 'x': case 'y': case 'z':
-  case '_':
+  case '_': case UTF8_IdentifierStart:
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
     return LexIdentifier(Result, CurPtr);
